funeralhomes-grab/pipeline.py

247 lines
8.5 KiB
Python
Raw Normal View History

2022-05-01 22:51:51 +00:00
###################
###FUNERAL HOMES###
###GRAB SCRIPTS####
###################
2022-05-18 01:36:37 +00:00
2022-05-01 22:51:51 +00:00
# Based heavily off of ArchiveTeam/urls-grab
2022-05-18 01:36:37 +00:00
import seesaw
2022-05-01 22:51:51 +00:00
from seesaw.project import *
from seesaw.tracker import *
from seesaw.util import *
from seesaw.pipeline import Pipeline
2022-05-18 01:36:37 +00:00
from seesaw.externalprocess import WgetDownload
from seesaw.item import ItemInterpolation, ItemValue
from seesaw.task import SimpleTask, LimitConcurrent
2022-05-01 22:51:51 +00:00
import hashlib
import shutil
import socket
import sys
import json
import time
2022-05-18 01:36:37 +00:00
2022-05-01 22:51:51 +00:00
# Project metadata object built from a title and a short HTML description.
project = Project(
    title="No",
    project_html="""
<h2>Funeral homes</h2>
<p>Archiving funeral homes, because who else will?</p>
""",
)
2022-05-18 01:36:37 +00:00
###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20220428.01'
#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'

# Project slug: used in the tracker URL path and in the WARC headers.
TRACKER_ID = 'funeralhomes'
# host:port of the tracker that hands out items and receives results.
TRACKER_HOST = '172.17.0.1:8501'
2022-05-18 01:36:37 +00:00
# Locate the Wget-AT binary: the candidate paths are passed along with the
# version string that is expected from it.
WGET_AT = find_executable(
    'Wget+AT',
    ['GNU Wget 1.20.3-at.20211001.01'],
    ['./wget-at', '/home/warrior/data/wget-at'],
)

# Nothing in this pipeline can run without the binary, so fail at load time.
if not WGET_AT:
    raise Exception('No usable Wget+At found.')
class CheckIP(SimpleTask):
    """Sanity-check DNS resolution before working on items.

    Five unrelated host names are resolved; if they do not yield five
    distinct addresses the task logs the addresses and raises, on the
    assumption that a firewall or proxy is interfering.  The check runs on
    the first item and then again after every ten further items.
    """

    def __init__(self):
        SimpleTask.__init__(self, 'CheckIP')
        # Number of items still to pass through before the next check.
        self._counter = 0

    def process(self, item):
        # NEW for 2014! Check if we are behind firewall/proxy
        check_due = self._counter <= 0

        if check_due:
            item.log_output('Checking IP address.')
            hosts = (
                'twitter.com',
                # 'facebook.com',
                'youtube.com',
                'microsoft.com',
                'icanhas.cheezburger.com',
                'archiveteam.org',
            )
            ip_set = {socket.gethostbyname(host) for host in hosts}

            # Every host must resolve to its own address.
            if len(ip_set) != len(hosts):
                complaint = 'Are you behind a firewall/proxy? That is a big no-no!'
                item.log_output('Got IP addresses: {0}'.format(ip_set))
                item.log_output(complaint)
                raise Exception(complaint)

        # Check only occasionally
        self._counter = 10 if check_due else self._counter - 1
class PrepareDirectories(SimpleTask):
    """Create a clean working directory and WARC base name for an item.

    The directory is named after the SHA-1 of the item name and lives under
    ``item['data_dir']``.  Sets ``item['item_dir']`` and
    ``item['warc_file_base']`` and creates empty placeholder files for the
    WARC and the retry-URL list.
    """

    def __init__(self, warc_prefix):
        SimpleTask.__init__(self, 'PrepareDirectories')
        self.warc_prefix = warc_prefix

    def process(self, item):
        name_digest = hashlib.sha1(item['item_name'].encode('utf8')).hexdigest()
        work_dir = '/'.join((item['data_dir'], name_digest))

        # Start from an empty directory even if an earlier run left one behind.
        if os.path.isdir(work_dir):
            shutil.rmtree(work_dir)
        os.makedirs(work_dir)

        item['item_dir'] = work_dir
        timestamp = time.strftime('%Y%m%d-%H%M%S')
        item['warc_file_base'] = '-'.join([self.warc_prefix, name_digest, timestamp])

        # Touch the output files so that they exist before Wget runs.
        placeholders = (
            '%(item_dir)s/%(warc_file_base)s.warc.gz',
            '%(item_dir)s/%(warc_file_base)s_retry-urls.txt',
        )
        for template in placeholders:
            open(template % item, 'w').close()
2022-05-18 01:36:37 +00:00
def get_hash(filename):
    """Return the hexadecimal SHA-1 digest of the contents of *filename*."""
    digest = hashlib.sha1()
    with open(filename, 'rb') as in_file:
        digest.update(in_file.read())
    return digest.hexdigest()
# Fingerprint the two scripts that define this grab; both are looked up in
# the current working directory at import time.
CWD = os.getcwd()
PIPELINE_SHA1, LUA_SHA1 = (
    get_hash(os.path.join(CWD, script_name))
    for script_name in ('pipeline.py', 'grab.lua')
)
2022-05-18 01:36:37 +00:00
def stats_id_function(item):
    """Return a dict identifying the code that produced an item.

    Contains the SHA-1 of the pipeline script, the SHA-1 of the Lua script
    and the Python version string.  *item* is accepted but not consulted.
    """
    return {
        'pipeline_hash': PIPELINE_SHA1,
        'lua_hash': LUA_SHA1,
        'python_version': sys.version,
    }
2022-05-01 22:51:51 +00:00
class MoveFiles(SimpleTask):
    """Move a finished item's working directory into ``/finished``.

    The destination directory is named
    ``<data dir without prefix>_<item name>_<unix timestamp>``.  The
    intermediate values are stored on the item as ``item['dd']`` and
    ``item['ts']``.
    """

    def __init__(self):
        SimpleTask.__init__(self, 'MoveFiles')

    def process(self, item):
        item["ts"] = time.time()

        # Remove the literal "grab/data/" prefix.  str.lstrip() must not be
        # used for this: it treats its argument as a set of characters and
        # would also eat leading characters of the directory name itself
        # (e.g. "grab/data/abc123" would become "c123").
        prefix = "grab/data/"
        data_dir = item["data_dir"]
        if data_dir.startswith(prefix):
            data_dir = data_dir[len(prefix):]
        item["dd"] = data_dir

        shutil.move('%(item_dir)s/' % item,
                    '/finished/%(dd)s_%(item_name)s_%(ts)s/' % item)
2022-05-18 01:36:37 +00:00
class WgetArgs(object):
    """Build the Wget-AT command line for one tracker item.

    ``item['item_name']`` may hold several ``<type>:<id>`` names separated by
    NUL characters.  Each name is recorded in the WARC headers, passed to
    Wget as an ``item-name://`` pseudo URL, and expanded into the real URLs
    to fetch.
    """

    @staticmethod
    def _urls_for_item(item_name):
        """Return the URLs to fetch for a single ``<type>:<id>`` item name.

        Only the text between the first and second ``:`` is used as the
        identifier.

        Raises:
            TypeError: the item type is not one this project knows about.
            ValueError: the name carries no identifier after the type.
        """
        parts = item_name.split(':')
        kind = parts[0]
        if kind not in ('downsandson', 'tharpsontheimerfh', 'bestattung-mullner'):
            raise TypeError("bad item type")
        # A known type without an identifier used to surface as a bare
        # IndexError; report what is actually wrong instead.
        if len(parts) < 2 or not parts[1]:
            raise ValueError('item name %r has no identifier' % (item_name,))
        ident = parts[1]

        if kind == 'downsandson':
            return [
                f'https://downsandsonfuneralhome.com/tribute/details/{ident}/Dr-Alex-Klym/obituary.html',
            ]

        if kind == 'tharpsontheimerfh':
            tribute = f'https://www.tharpsontheimerfh.com/tributes/{ident}'
            return [
                tribute,
                f'https://www.tharpsontheimerfh.com/printnotice/{ident}/1o/1c/1q/0d/1b',
                tribute + '/guest-book',
                tribute + '/photo-album',
                tribute + '/services',
            ]

        # kind == 'bestattung-mullner': the page itself plus its four
        # "action" views, once per language prefix.
        actions = ('parte', 'sterbebild', 'gedenkkerzen', 'kondolenzbuch')
        urls = []
        for language in ('', '/en', '/sk'):  # de, en, sk
            page = 'https://www.bestattung-muellner.at%s/sterbefall/%s/' % (language, ident)
            urls.append(page)
            urls.extend(page + '?action=' + action for action in actions)
        return urls

    def realize(self, item):
        """Return the realized argument list for *item*.

        Side effects on *item*: sets ``item_name_newline`` (the item names
        separated by newlines instead of NULs), ``item_urls`` (every URL
        queued) and ``custom_items`` (a JSON object, currently always empty).
        """
        wget_args = [
            # The command is run under `timeout 3600`.
            'timeout', '3600',
            WGET_AT,
            '-v',
            '--content-on-error',
            '--lua-script', 'grab.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            #'--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--page-requisites',
            '--timeout', '10',
            '--tries', '10',
            '--span-hosts',
            '--waitretry', '0',
            '-w', '1',
            '--random-wait',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: TheTechRobo <thetechrobo@protonmail.ch>',
            '--warc-header', json.dumps(stats_id_function(item)),
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
            '--header', 'Contact: Discord TheTechRobo#7420',
            '--header', 'Connection: keep-alive',
            '-U', 'Mozilla/5.0 (Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0 ; Operator: TheTechRobo thetechrobo@protonmail.ch',
        ]

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        item_urls = []
        custom_items = {}
        for item_name in item['item_name'].split('\0'):
            wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
            wget_args.append('item-name://' + item_name)
            urls = self._urls_for_item(item_name)
            item_urls.extend(urls)
            wget_args.extend(urls)

        item['item_urls'] = item_urls
        item['custom_items'] = json.dumps(custom_items)

        # Optional module-level `bind_address` global selects the outgoing IP.
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
# The task chain each item passes through, in order: sanity-check the
# network, fetch an item from the tracker, prepare its directory, run
# Wget-AT, collect stats, move the output to /finished and report back.
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        downloader,
        VERSION,
    ),
    PrepareDirectories(warc_prefix='funeralhome'),
    WgetDownload(
        WgetArgs(),
        max_tries=1,
        # NOTE(review): 4 and 8 are presumably Wget's "network failure" and
        # "server issued an error response" exit codes - confirm against the
        # Wget manual.
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            # The Lua script gets the newline-separated form of the item name.
            'item_name': ItemValue('item_name_newline'),
            'custom_items': ItemValue('custom_items'),
            'warc_file_base': ItemValue('warc_file_base'),
        },
    ),
    #CheckLandslide(),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz'),
            ],
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    SendDoneToTracker(
        tracker_url='http://{}/{}'.format(TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats'),
    ),
)