diff --git a/grab.lua b/grab.lua
index 8b122f9..009fce5 100644
--- a/grab.lua
+++ b/grab.lua
@@ -59,6 +59,18 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
         end
         table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/prnobit", post_data="ok=" .. ok .. "&fcf=0&bg=1"})
     end
-    io.stderr:write(table.show(addedUrls, "Added URLs "))
+    -- bestattung-muellner.at "gedenkkerzen" pages: queue every link found under the pagination element.
+    if url:match("^https://www%.bestattung%-muellner%.at/?.?.?/sterbefall/[^/]+/%?action=gedenkkerzen") then
+        local root = htmlparser.parse(data)
+        local otherPages = root(".pagination .inactive")
+        for _, pagination in ipairs(otherPages) do
+            local href = pagination.attributes.href
+            -- An element without an href would otherwise queue an entry whose url is nil.
+            if href then
+                table.insert(addedUrls, { url=href })
+            end
+        end
+    end
+    io.stderr:write(table.show(addedUrls, "Added URLs"))
     return addedUrls
 end
diff --git a/pipeline.py b/pipeline.py
index 0899b76..67ecc59 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -36,7 +36,7 @@ project = Project(
 # It will be added to the WARC files and reported to the tracker.
 VERSION = '20220428.01'
 #USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
-TRACKER_ID = 'funeralhomes'
+TRACKER_ID = 'funeralhomestest'
 TRACKER_HOST = '172.17.0.1:8501'
 
 WGET_AT = find_executable(
@@ -184,6 +184,15 @@ class WgetArgs(object):
             item_name = [f'https://downsandsonfuneralhome.com/tribute/details/{i_n[1]}/Dr-Alex-Klym/obituary.html']
         elif i_n[0] == 'tharpsontheimerfh':
             item_name = [f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}', f'https://www.tharpsontheimerfh.com/printnotice/{i_n[1]}/1o/1c/1q/0d/1b', f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}/guest-book', f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}/photo-album', f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}/services']
+        elif i_n[0] == 'bestattung-mullner':
+            # One base URL per language prefix, followed by its four ?action= variants;
+            # the order is language-major: base, parte, sterbebild, gedenkkerzen, kondolenzbuch.
+            actions = ('', '?action=parte', '?action=sterbebild', '?action=gedenkkerzen', '?action=kondolenzbuch')
+            item_name = [
+                f'https://www.bestattung-muellner.at{language}/sterbefall/{i_n[1]}/{action}'
+                for language in ('', '/en', '/sk')  # de, en, sk
+                for action in actions
+            ]
         else:
             raise TypeError("bad item type")
         item_urls+=(item_name)