Init
This commit is contained in:
parent
5bdb49fb80
commit
dd3f82e623
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
|||
run:
|
||||
make clean
|
||||
docker build -t img .
|
||||
docker run -v "/media/thetechrobo/2tb/obitdata:/finished" --rm img TheTechRobo --concurrent 1
|
||||
docker run -v "/media/thetechrobo/2tb/spoll:/finished" --rm img TheTechRobo --concurrent 1
|
||||
|
||||
clean:
|
||||
rm -rf img
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
import os
|
||||
#data = "\n".join([str(x) for x in range(1, 50000001)])
|
||||
curl = """curl -i 'http://localhost:8501/strawpoll/admin/queues' -X POST -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' -H 'Accept-Encoding: gzip, deflate, br' -H 'Content-Type: multipart/form-data; boundary=---------------------------308383748137752883524073922649' -H 'Origin: http://localhost:8501' -H 'Authorization: Basic dGhldGVjaHJvYm86ZWxpdmVJU3RoZWJlc3Q0OQ==' -H 'Connection: keep-alive' -H 'Referer: http://localhost:8501/strawpoll/admin/queues' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: same-origin' -H 'Sec-Fetch-User: ?1' -H 'Pragma: no-cache' -H 'Cache-Control: no-cache' --data-binary $'-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="queue"\r\n\r\ntodo1\r\n-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="downloader"\r\n\r\n\r\n-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="check"\r\n\r\nyes\r\n-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="items"\r\n\r\n%(items)s\r\n-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="items-file"; filename=""\r\nContent-Type: application/octet-stream\r\n\r\n-----------------------------308383748137752883524073922649--\r\n'"""
|
||||
|
||||
for i in range(1, 50000001):
|
||||
if i % 5 == 0:
|
||||
print(i)
|
||||
os.system(curl % {'items':str(i)})
|
||||
|
||||
#os.system(curl % data)
|
|
@ -1,69 +0,0 @@
|
|||
names = []
|
||||
import bs4 as bs
|
||||
|
||||
import requests
|
||||
|
||||
def disco(page) -> str:
    """Fetch one page of the obituary listing and return the raw response body.

    Sends a form-encoded POST to the site's ``/pax/obitsrch`` endpoint with
    ``page`` as the ``pg`` field and fixed values for every other field.

    Args:
        page: Page index placed in the ``pg`` form field.

    Returns:
        The response body as text (``response.text``). The HTTP status is
        not inspected, so error pages are returned as-is.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        # 'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
    }
    data = {
        'pg': page,
        'term': '',
        'paginate': '1',
        'ym': '0',
        'showmiddlename': '0',
        'listcity': '0',
        'tgt': 'obitlist',
        'numlistings': '10',
        'sids': '10627',
        'typ': '1',
        'txtsrch': '0',
    }

    response = requests.post('https://www.tharpsontheimerfh.com/pax/obitsrch', headers=headers, data=data)
    return response.text
|
||||
|
||||
for i in range(0, 191):
|
||||
soup = bs.BeautifulSoup(disco(i), 'lxml')
|
||||
for link in soup.select(".obitlist-title a"):
|
||||
names.append(link.get('href').split("/")[2])
|
||||
print(i)
|
||||
|
||||
name_ = (" tharpsontheimerfh:".join(names))
|
||||
print(name_)
|
||||
|
||||
headers = {
|
||||
'Content-Type': 'multipart/form-data; boundary=---------------------------13612081884110273351021381409',
|
||||
'Origin': 'http://localhost:8501',
|
||||
'Authorization': 'Basic dGhldGVjaHJvYm86ZWxpdmVJU3RoZWJlc3Q0OQ==',
|
||||
'Connection': 'keep-alive',
|
||||
'Referer': 'http://localhost:8501/funeralhomes/admin/queues',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Pragma': 'no-cache',
|
||||
'Cache-Control': 'no-cache',
|
||||
}
|
||||
|
||||
|
||||
data = '-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="queue"\r\n\r\ntodo\r\n-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="downloader"\r\n\r\n\r\n-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="check"\r\n\r\nno\r\n-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="items"\r\n\r\n%s\r\n-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="items-file"; filename=""\r\nContent-Type: application/octet-stream\r\n\r\n-----------------------------13612081884110273351021381409--\r\n'
|
||||
|
||||
|
||||
print(requests.post('http://localhost:8501/funeralhomes/admin/queues', allow_redirects=False, headers=headers, data=data % name_).headers.get("Location"))
|
||||
|
||||
|
82
grab.lua
82
grab.lua
|
@ -10,55 +10,63 @@ end
|
|||
|
||||
QUEUED_URLS = false
|
||||
|
||||
--- Split a string into the non-empty fields found between separator characters.
-- Every character of `sep` acts as a literal delimiter; runs of delimiters
-- are collapsed, so empty fields are never produced.
-- @param s   string to split
-- @param sep string whose characters each delimit fields
-- @return array-like table holding the fields in order of appearance
function split(s, sep)
  -- Escape every non-alphanumeric character so that pattern-magic characters
  -- in the separator ("%", "-", "^", "]") are taken literally inside the
  -- character class instead of corrupting the pattern or forming a range.
  local escaped = string.gsub(sep, "%W", "%%%0")
  local pattern = "([^" .. escaped .. "]+)"
  local fields = {}
  for field in string.gmatch(s, pattern) do
    fields[#fields + 1] = field
  end
  return fields
end
|
||||
|
||||
--- Report whether `text` begins with `prefix`.
-- The search is a plain-text one, so pattern-magic characters in `prefix`
-- have no special meaning.
-- @param text   string to inspect
-- @param prefix string expected at the very start of `text`
-- @return true when `prefix` occurs at position 1, false otherwise
function startswith(text, prefix)
  local match_start = text:find(prefix, 1, true)
  return match_start == 1
end
|
||||
|
||||
--- Linear search for a value in the array part of a table.
-- @param list array-like table to scan (indices 1..#list)
-- @param val  value compared with `==` against each element
-- @return true if some element equals `val`, false otherwise
local function contains(list, val)
  -- The parameter is deliberately not called `table`: that name would hide
  -- the standard `table` library for the whole function body.
  for i = 1, #list do
    if list[i] == val then
      return true
    end
  end
  return false
end
|
||||
|
||||
-- Decide whether a discovered child URL should be downloaded.
-- URLs already present in the global `ign` list are rejected (with a notice
-- on stdout); for every other URL the incoming `verdict` is returned
-- unchanged.
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
  local candidate = urlpos.url.url
  if not contains(ign, candidate) then
    return verdict
  end
  print("Skipping already-retrieved URL " .. candidate)
  return false
end
|
||||
|
||||
-- Called with the result of a finished request.
-- Logs the status code immediately followed by the URL to stderr, and
-- appends the URL to the global `ign` list when the status code is 200.
wget.callbacks.httploop_result = function(url, err, http_stat)
  local fetched = url.url
  local status = http_stat.statcode
  io.stderr:write(status .. fetched .. "\n")
  if status ~= 200 then
    return
  end
  table.insert(ign, fetched)
end
|
||||
|
||||
wget.callbacks.get_urls = function(file, url, is_css, iri)
|
||||
local addedUrls = {}
|
||||
local data = readAll(file)
|
||||
io.stderr:write("Read data\n")
|
||||
if url:match("https://downsandsonfuneralhome%.com/tribute/details/[^/]+/Dr%-Alex%-Klym/obituary%.html") then
|
||||
local root = htmlparser.parse(data)
|
||||
io.stderr:write("Read root\n")
|
||||
local dataa ={}
|
||||
dataa.obit = root("#obituary-link-list-item a")
|
||||
dataa.cond = root("#condolences-link-list-item a")
|
||||
dataa.serv = root("#service-link-list-item a")
|
||||
dataa.mems = root("#memories-link-list-item a")
|
||||
dataa.char = root("#charities-link-list-item a")
|
||||
dataa.prin = root(".print-obit-btn a")
|
||||
assert(not dataa.cond[2])
|
||||
assert(not dataa.serv[2])
|
||||
assert(not dataa.mems[2])
|
||||
assert(not dataa.char[2])
|
||||
assert(not dataa.prin[2])
|
||||
assert(not dataa.obit[2]) -- make sure that there's only one element that fits the criteria
|
||||
table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.obit[1].attributes.href})
|
||||
table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.cond[1].attributes.href})
|
||||
table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.serv[1].attributes.href})
|
||||
table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.char[1].attributes.href})
|
||||
if dataa.prin[1] then
|
||||
table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.prin[1].attributes.href})
|
||||
end
|
||||
QUEUED_URLS = true
|
||||
end
|
||||
if url:match("https://www.tharpsontheimerfh.com/tributes/[^/]+/?$") then
|
||||
local ok=os.getenv("item_name")
|
||||
local root=htmlparser.parse(data)
|
||||
local div =root("#obitsbarV31")
|
||||
assert(not div[2])
|
||||
if div[1] then
|
||||
local oid=div[1].attributes["data-oid"]
|
||||
table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/twshgal", post_data="oid=" .. oid})
|
||||
table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/obpgsnvn", post_data="sn=tributewall&oid=" .. oid})
|
||||
end
|
||||
table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/prnobit", post_data="ok=" .. ok .. "&fcf=0&bg=1"})
|
||||
local item_name = os.getenv("item_name")
|
||||
if url:match("https://www%.strawpoll%.me/[^/]+/fetch%-comments%?page=.$") then
|
||||
page = page + 1
|
||||
table.insert(addedUrls, { url="https://www.strawpoll.me/" .. item_name .. "/fetch-comments?page=" .. page })
|
||||
table.insert(addedUrls, { url="https://www.strawpoll.me/" .. item_name .. "/fetch-comments?page=" .. page .. "&_=1653344833516"})
|
||||
end
|
||||
io.stderr:write(table.show(addedUrls, "Added URLs "))
|
||||
return addedUrls
|
||||
end
|
||||
|
||||
-- Persist the global `ign` URL list, one URL per line, to
-- "$item_dir/alr_urls.txt" when the run finishes.
-- Raises an error carrying the path and the reason if the file cannot be
-- opened for writing.
wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time)
  local path = os.getenv("item_dir") .. "/alr_urls.txt"
  local file, open_err = io.open(path, "w")
  if not file then
    error("cannot open " .. path .. " for writing: " .. tostring(open_err))
  end
  -- Write through the handle itself; redirecting the process-wide default
  -- output with io.output() would leave it pointing at a closed file.
  file:write(table.concat(ign, "\n"))
  file:close()
  print("Wrote data.")
end
|
||||
|
||||
-- Global list of URLs to ignore, loaded from the newline-separated
-- "$item_dir/alr_urls.txt" file.
ign = split(readAll(os.getenv("item_dir") .. "/alr_urls.txt"), "\n")

-- Global comment-page counter; get_urls increments it before queueing the
-- next fetch-comments page.
page = 1
|
||||
|
|
27
pipeline.py
27
pipeline.py
|
@ -24,8 +24,8 @@ import time
|
|||
project = Project(
|
||||
title = "No",
|
||||
project_html = """
|
||||
<h2>Funeral homes</h2>
|
||||
<p>Archiving funeral homes, because who else will?</p>
|
||||
<h2>StrawPoll</h2>
|
||||
<p>Very popular website, no archives that I know of.</p>
|
||||
""",
|
||||
)
|
||||
|
||||
|
@ -34,9 +34,9 @@ project = Project(
|
|||
#
|
||||
# Update this each time you make a non-cosmetic change.
|
||||
# It will be added to the WARC files and reported to the tracker.
|
||||
VERSION = '20220428.01'
|
||||
VERSION = '20220523.1'
|
||||
#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
|
||||
TRACKER_ID = 'funeralhomes'
|
||||
TRACKER_ID = 'strawpoll'
|
||||
TRACKER_HOST = '172.17.0.1:8501'
|
||||
|
||||
WGET_AT = find_executable(
|
||||
|
@ -112,6 +112,9 @@ class PrepareDirectories(SimpleTask):
|
|||
|
||||
open('%(item_dir)s/%(warc_file_base)s.warc.gz' % item, 'w').close()
|
||||
open('%(item_dir)s/%(warc_file_base)s_retry-urls.txt' % item, 'w').close()
|
||||
open('%(item_dir)s/alr_urls.txt' % item, 'x').close()
|
||||
|
||||
shutil.copy('/finished/alr_urls.txt', '%(item_dir)s/alr_urls.txt' % item)
|
||||
|
||||
def get_hash(filename):
|
||||
with open(filename, 'rb') as in_file:
|
||||
|
@ -137,6 +140,8 @@ class MoveFiles(SimpleTask):
|
|||
def process(self, item):
|
||||
item["ts"] = time.time()
|
||||
item["dd"] = item["data_dir"].lstrip("grab/data/")
|
||||
shutil.copy('%(item_dir)s/alr_urls.txt' % item,
|
||||
'/finished/alr_urls.txt')
|
||||
shutil.move('%(item_dir)s/' % item,
|
||||
'/finished/%(dd)s_%(item_name)s_%(ts)s/' % item)
|
||||
|
||||
|
@ -179,15 +184,9 @@ class WgetArgs(object):
|
|||
for item_name in item['item_name'].split('\0'):
|
||||
wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
|
||||
wget_args.append('item-name://'+item_name)
|
||||
i_n = item_name.split(':')
|
||||
if i_n[0] == 'downsandson':
|
||||
item_name = [f'https://downsandsonfuneralhome.com/tribute/details/{i_n[1]}/Dr-Alex-Klym/obituary.html']
|
||||
elif i_n[0] == 'tharpsontheimerfh':
|
||||
item_name = [f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}', f'https://www.tharpsontheimerfh.com/printnotice/{i_n[1]}/1o/1c/1q/0d/1b', f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}/guest-book', f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}/photo-album', f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}/services']
|
||||
else:
|
||||
raise TypeError("bad item type")
|
||||
item_urls+=(item_name)
|
||||
wget_args+=(item_name)
|
||||
urls = [f"https://www.strawpoll.me/{item_name}", f"https://www.strawpoll.me/{item_name}/r", f"https://www.strawpoll.me/{item_name}/fetch-comments?page=1&_=1653344833516", f"https://www.strawpoll.me/{item_name}/fetch-comments?page=1"]
|
||||
item_urls+=(urls)
|
||||
wget_args+=(urls)
|
||||
|
||||
item['item_urls'] = item_urls
|
||||
item['custom_items'] = json.dumps(custom_items)
|
||||
|
@ -206,7 +205,7 @@ pipeline = Pipeline(
|
|||
GetItemFromTracker('http://{}/{}'
|
||||
.format(TRACKER_HOST, TRACKER_ID),
|
||||
downloader, VERSION),
|
||||
PrepareDirectories(warc_prefix='funeralhome'),
|
||||
PrepareDirectories(warc_prefix='strawpool'),
|
||||
WgetDownload(
|
||||
WgetArgs(),
|
||||
max_tries=1,
|
||||
|
|
Loading…
Reference in New Issue