diff --git a/Makefile b/Makefile
index fb15271..bcb0c8d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 run:
	make clean
	docker build -t img .
-	docker run -v "/media/thetechrobo/2tb/obitdata:/finished" --rm img TheTechRobo --concurrent 1
+	docker run -v "/media/thetechrobo/2tb/spoll:/finished" --rm img TheTechRobo --concurrent 1
 
 clean:
	rm -rf img
diff --git a/disco-graphy.py b/disco-graphy.py
new file mode 100644
index 0000000..732f232
--- /dev/null
+++ b/disco-graphy.py
@@ -0,0 +1,10 @@
+import os
+#data = "\n".join([str(x) for x in range(1, 50000001)])
+curl = """curl -i 'http://localhost:8501/strawpoll/admin/queues' -X POST -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' -H 'Accept-Encoding: gzip, deflate, br' -H 'Content-Type: multipart/form-data; boundary=---------------------------308383748137752883524073922649' -H 'Origin: http://localhost:8501' -H 'Authorization: Basic dGhldGVjaHJvYm86ZWxpdmVJU3RoZWJlc3Q0OQ==' -H 'Connection: keep-alive' -H 'Referer: http://localhost:8501/strawpoll/admin/queues' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: same-origin' -H 'Sec-Fetch-User: ?1' -H 'Pragma: no-cache' -H 'Cache-Control: no-cache' --data-binary $'-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="queue"\r\n\r\ntodo1\r\n-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="downloader"\r\n\r\n\r\n-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="check"\r\n\r\nyes\r\n-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="items"\r\n\r\n%(items)s\r\n-----------------------------308383748137752883524073922649\r\nContent-Disposition: form-data; name="items-file"; filename=""\r\nContent-Type: application/octet-stream\r\n\r\n-----------------------------308383748137752883524073922649--\r\n'"""
+
+for i in range(1, 50000001):
+    if i % 5 == 0:
+        print(i)
+    os.system(curl % {'items':str(i)})
+
+#os.system(curl % data)
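disco-graphy.py queues the whole poll-ID space (1 through 50,000,000) into the tracker's todo1 queue, shelling out to curl once per ID and printing every fifth ID as a rough progress marker; the commented-out lines show the earlier attempt to post all IDs in one newline-joined request. For reference, the same POST can be made without forking one curl process per ID. A minimal sketch with requests, assuming only the endpoint, auth header, and form fields visible in the curl command above (the batch size and session reuse are illustrative, not from the repo):

    import requests

    URL = "http://localhost:8501/strawpoll/admin/queues"
    # Same Basic auth header the curl command sends.
    HEADERS = {"Authorization": "Basic dGhldGVjaHJvYm86ZWxpdmVJU3RoZWJlc3Q0OQ=="}

    session = requests.Session()  # reuse one connection across posts

    def enqueue(items):
        # requests generates the multipart boundary itself; the field names
        # mirror the form fields posted by the curl command (queue,
        # downloader, check, items, items-file).
        files = {
            "queue": (None, "todo1"),
            "downloader": (None, ""),
            "check": (None, "yes"),
            "items": (None, "\n".join(str(i) for i in items)),
            "items-file": ("", b"", "application/octet-stream"),
        }
        session.post(URL, headers=HEADERS, files=files).raise_for_status()

    # Post in batches instead of one request per ID; 10000 is a guess.
    BATCH = 10000
    for start in range(1, 50000001, BATCH):
        enqueue(range(start, min(start + BATCH, 50000001)))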
diff --git a/discovery_TharpsonTheimer.py b/discovery_TharpsonTheimer.py
deleted file mode 100644
index c8b5c09..0000000
--- a/discovery_TharpsonTheimer.py
+++ /dev/null
@@ -1,69 +0,0 @@
-names = []
-import bs4 as bs
-
-import requests
-
-def disco(page) -> list:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        # 'Accept-Encoding': 'gzip, deflate, br',
-        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
-        'Sec-Fetch-Dest': 'document',
-        'Sec-Fetch-Mode': 'navigate',
-        'Sec-Fetch-Site': 'none',
-        'Sec-Fetch-User': '?1',
-        'Pragma': 'no-cache',
-        'Cache-Control': 'no-cache',
-    }
-    data = {
-        'pg': page,
-        'term': '',
-        'paginate': '1',
-        'ym': '0',
-        'showmiddlename': '0',
-        'listcity': '0',
-        'tgt': 'obitlist',
-        'numlistings': '10',
-        'sids': '10627',
-        'typ': '1',
-        'txtsrch': '0',
-    }
-
-    response = requests.post('https://www.tharpsontheimerfh.com/pax/obitsrch', headers=headers, data=data)
-    return response.text
-
-for i in range(0, 191):
-    soup = bs.BeautifulSoup(disco(i), 'lxml')
-    for link in soup.select(".obitlist-title a"):
-        names.append(link.get('href').split("/")[2])
-    print(i)
-
-name_ = (" tharpsontheimerfh:".join(names))
-print(name_)
-
-headers = {
-    'Content-Type': 'multipart/form-data; boundary=---------------------------13612081884110273351021381409',
-    'Origin': 'http://localhost:8501',
-    'Authorization': 'Basic dGhldGVjaHJvYm86ZWxpdmVJU3RoZWJlc3Q0OQ==',
-    'Connection': 'keep-alive',
-    'Referer': 'http://localhost:8501/funeralhomes/admin/queues',
-    'Upgrade-Insecure-Requests': '1',
-    'Sec-Fetch-Dest': 'document',
-    'Sec-Fetch-Mode': 'navigate',
-    'Sec-Fetch-Site': 'same-origin',
-    'Sec-Fetch-User': '?1',
-    'Pragma': 'no-cache',
-    'Cache-Control': 'no-cache',
-}
-
-
-data = '-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="queue"\r\n\r\ntodo\r\n-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="downloader"\r\n\r\n\r\n-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="check"\r\n\r\nno\r\n-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="items"\r\n\r\n%s\r\n-----------------------------13612081884110273351021381409\r\nContent-Disposition: form-data; name="items-file"; filename=""\r\nContent-Type: application/octet-stream\r\n\r\n-----------------------------13612081884110273351021381409--\r\n'
-
-
-print(requests.post('http://localhost:8501/funeralhomes/admin/queues', allow_redirects=False, headers=headers, data=data % name_).headers.get("Location"))
-
-
diff --git a/grab.lua b/grab.lua
index 8b122f9..42edd1f 100644
--- a/grab.lua
+++ b/grab.lua
@@ -10,55 +10,63 @@ end
 
 QUEUED_URLS = false
 
+function split(s, sep)
+  local fields = {}
+  local pattern = string.format("([^%s]+)", sep)
+  string.gsub(s, pattern, function(c) fields[#fields + 1] = c end)
+  return fields
+end
+
 function startswith(text, prefix)
   return text:find(prefix, 1, true) == 1
 end
 
+local function contains(table, val)
+  for i=1,#table do
+    if table[i] == val then
+      return true
+    end
+  end
+  return false
+end
+
+wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
+  if contains(ign, urlpos.url.url) then
+    print("Skipping already-retrieved URL " .. urlpos.url.url)
+    return false
+  end
+  return verdict
+end
+
 wget.callbacks.httploop_result = function(url, err, http_stat)
   io.stderr:write(http_stat["statcode"] .. url["url"] .. "\n")
+  if http_stat.statcode == 200 then
+    table.insert(ign, url.url)
+  end
 end
 
 wget.callbacks.get_urls = function(file, url, is_css, iri)
   local addedUrls = {}
   local data = readAll(file)
-  io.stderr:write("Read data\n")
-  if url:match("https://downsandsonfuneralhome%.com/tribute/details/[^/]+/Dr%-Alex%-Klym/obituary%.html") then
-    local root = htmlparser.parse(data)
-    io.stderr:write("Read root\n")
-    local dataa ={}
-    dataa.obit = root("#obituary-link-list-item a")
-    dataa.cond = root("#condolences-link-list-item a")
-    dataa.serv = root("#service-link-list-item a")
-    dataa.mems = root("#memories-link-list-item a")
-    dataa.char = root("#charities-link-list-item a")
-    dataa.prin = root(".print-obit-btn a")
-    assert(not dataa.cond[2])
-    assert(not dataa.serv[2])
-    assert(not dataa.mems[2])
-    assert(not dataa.char[2])
-    assert(not dataa.prin[2])
-    assert(not dataa.obit[2]) -- make sure that there's only one element that fits the criteria
-    table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.obit[1].attributes.href})
-    table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.cond[1].attributes.href})
-    table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.serv[1].attributes.href})
-    table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.char[1].attributes.href})
-    if dataa.prin[1] then
-      table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.prin[1].attributes.href})
-    end
-    QUEUED_URLS = true
-  end
-  if url:match("https://www.tharpsontheimerfh.com/tributes/[^/]+/?$") then
-    local ok=os.getenv("item_name")
-    local root=htmlparser.parse(data)
-    local div =root("#obitsbarV31")
-    assert(not div[2])
-    if div[1] then
-      local oid=div[1].attributes["data-oid"]
-      table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/twshgal", post_data="oid=" .. oid})
-      table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/obpgsnvn", post_data="sn=tributewall&oid=" .. oid})
-    end
-    table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/prnobit", post_data="ok=" .. ok .. "&fcf=0&bg=1"})
+  local item_name = os.getenv("item_name")
+  if url:match("https://www%.strawpoll%.me/[^/]+/fetch%-comments%?page=.$") then
+    page = page + 1
+    table.insert(addedUrls, { url="https://www.strawpoll.me/" .. item_name .. "/fetch-comments?page=" .. page })
+    table.insert(addedUrls, { url="https://www.strawpoll.me/" .. item_name .. "/fetch-comments?page=" .. page .. "&_=1653344833516"})
   end
   io.stderr:write(table.show(addedUrls, "Added URLs "))
   return addedUrls
 end
+
+wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time)
+  local file = io.open(os.getenv("item_dir") .. "/alr_urls.txt", "w")
+  io.output(file)
+  io.write(table.concat(ign, "\n"))
+  io.close(file)
+  print("Wrote data.")
+end
+
+local _data = readAll(os.getenv("item_dir") .. "/alr_urls.txt")
+ign = split(_data, "\n")
+
+page = 1

diff --git a/pipeline.py b/pipeline.py
index 0899b76..358c710 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -24,8 +24,8 @@ import time
 
 project = Project(
     title = "No",
     project_html = """
-Funeral homes
-Archiving funeral homes, because who else will?
+StrawPoll
+Very popular website, no archives that I know of.
 """,
 )
@@ -34,9 +34,9 @@
 #
 # Update this each time you make a non-cosmetic change.
 # It will be added to the WARC files and reported to the tracker.
-VERSION = '20220428.01'
+VERSION = '20220523.1'
 #USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
-TRACKER_ID = 'funeralhomes'
+TRACKER_ID = 'strawpoll'
 TRACKER_HOST = '172.17.0.1:8501'
 
 WGET_AT = find_executable(
@@ -112,6 +112,9 @@ class PrepareDirectories(SimpleTask):
         open('%(item_dir)s/%(warc_file_base)s.warc.gz' % item, 'w').close()
         open('%(item_dir)s/%(warc_file_base)s_retry-urls.txt' % item, 'w').close()
+        open('%(item_dir)s/alr_urls.txt' % item, 'x').close()
+
+        shutil.copy('/finished/alr_urls.txt', '%(item_dir)s/alr_urls.txt' % item)
 
 
 def get_hash(filename):
     with open(filename, 'rb') as in_file:
@@ -137,6 +140,8 @@ class MoveFiles(SimpleTask):
     def process(self, item):
         item["ts"] = time.time()
         item["dd"] = item["data_dir"].lstrip("grab/data/")
+        shutil.copy('%(item_dir)s/alr_urls.txt' % item,
+                    '/finished/alr_urls.txt')
         shutil.move('%(item_dir)s/' % item,
             '/finished/%(dd)s_%(item_name)s_%(ts)s/' % item)
 
@@ -179,15 +184,9 @@
         for item_name in item['item_name'].split('\0'):
             wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
             wget_args.append('item-name://'+item_name)
-            i_n = item_name.split(':')
-            if i_n[0] == 'downsandson':
-                item_name = [f'https://downsandsonfuneralhome.com/tribute/details/{i_n[1]}/Dr-Alex-Klym/obituary.html']
-            elif i_n[0] == 'tharpsontheimerfh':
-                item_name = [f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}', f'https://www.tharpsontheimerfh.com/printnotice/{i_n[1]}/1o/1c/1q/0d/1b', f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}/guest-book', f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}/photo-album', f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}/services']
-            else:
-                raise TypeError("bad item type")
-            item_urls+=(item_name)
-            wget_args+=(item_name)
+            urls = [f"https://www.strawpoll.me/{item_name}", f"https://www.strawpoll.me/{item_name}/r", f"https://www.strawpoll.me/{item_name}/fetch-comments?page=1&_=1653344833516", f"https://www.strawpoll.me/{item_name}/fetch-comments?page=1"]
+            item_urls+=(urls)
+            wget_args+=(urls)
 
         item['item_urls'] = item_urls
         item['custom_items'] = json.dumps(custom_items)
@@ -206,7 +205,7 @@ pipeline = Pipeline(
     GetItemFromTracker('http://{}/{}'
         .format(TRACKER_HOST, TRACKER_ID),
         downloader, VERSION),
-    PrepareDirectories(warc_prefix='funeralhome'),
+    PrepareDirectories(warc_prefix='strawpool'),
     WgetDownload(
         WgetArgs(),
        max_tries=1,