local urlparse = require("socket.url")
local http = require("socket.http")
JSON = (loadfile "JSON.lua")()

local item_dir = os.getenv("item_dir")
local item_name = os.getenv("item_name")
local custom_items = os.getenv("custom_items")
local warc_file_base = os.getenv("warc_file_base")

local url_count = 0
local downloaded = {}
local abortgrab = false
local exit_url = false
local min_dedup_mb = 5

local timestamp = nil

if urlparse == nil or http == nil then
  io.stdout:write("socket not correctly installed.\n")
  io.stdout:flush()
  abortgrab = true
end
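
-- An item is one or more URLs: item_name holds them newline-separated, and
-- custom_items optionally maps URLs to JSON-encoded per-URL settings
-- (depth, all, random, ...) that are merged into the same work set.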
local urls = {}
for url in string.gmatch(item_name, "([^\n]+)") do
  urls[string.lower(url)] = true
end

local urls_settings = JSON:decode(custom_items)
for k, _ in pairs(urls_settings) do
  urls[string.lower(k)] = true
end

local status_code = nil

local redirect_urls = {}
local visited_urls = {}
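
-- Lua patterns for URL components that look auto-generated: UUID-shaped hex
-- groups (8-4-4-4-12 and 8-4-4-12), 9-10 digit query values and
-- digits-dot-digits numbers (timestamp-like), long alphanumeric tokens with
-- digit suffixes, and 32-character hex strings (MD5-like). The
-- interpretations here are inferred from the pattern shapes.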
local ids_to_ignore = {}
for _, lengths in pairs({{8, 4, 4, 4, 12}, {8, 4, 4, 12}}) do
  local uuid = ""
  for _, i in pairs(lengths) do
    for j=1,i do
      uuid = uuid .. "[0-9a-fA-F]"
    end
    if i ~= 12 then
      uuid = uuid .. "%-"
    end
  end
  ids_to_ignore[uuid] = true
end
local to_ignore = ""
for i=1,9 do
  to_ignore = to_ignore .. "[0-9]"
end
ids_to_ignore["%?" .. to_ignore .. "$"] = true
ids_to_ignore["%?" .. to_ignore .. "[0-9]$"] = true
ids_to_ignore[to_ignore .. "[0-9]%.[0-9][0-9][0-9][0-9]$"] = true
to_ignore = ""
for i=1,50 do
  to_ignore = to_ignore .. "[0-9a-zA-Z]"
end
ids_to_ignore[to_ignore .. "%-[0-9][0-9][0-9][0-9][0-9]"] = true
ids_to_ignore["[0-9a-zA-Z%-_]!%-?[0-9]"] = true
to_ignore = ""
for i=1,32 do
  to_ignore = to_ignore .. "[0-9a-fA-F]"
end
ids_to_ignore["[^0-9a-fA-F]" .. to_ignore .. "[^0-9a-fA-F]"] = true
ids_to_ignore["[^0-9a-fA-F]" .. to_ignore .. "$"] = true

local current_url = nil
local current_settings = nil
local bad_urls = {}
local queued_urls = {}
local bad_params = {}
local bad_patterns = {}
local ignore_patterns = {}
local page_requisite_patterns = {}
local duplicate_urls = {}
local extract_outlinks_patterns = {}
local item_first_url = nil
local redirect_domains = {}
local checked_domains = {}

local parenturl_uuid = nil
local parenturl_requisite = nil

local dupes_file = io.open("duplicate-urls.txt", "r")
for url in dupes_file:lines() do
  duplicate_urls[url] = true
end
dupes_file:close()
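
-- Each line of bad-params.txt is a query parameter name; every letter is
-- expanded into a [xX] character class so matching is case-insensitive.
-- For example, a (hypothetical) line "ref" becomes "[rR][eE][fF]".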
local bad_params_file = io.open("bad-params.txt", "r")
for param in bad_params_file:lines() do
  local param = string.gsub(
    param, "([a-zA-Z])",
    function(c)
      return "[" .. string.lower(c) .. string.upper(c) .. "]"
    end
  )
  table.insert(bad_params, param)
end
bad_params_file:close()

local bad_patterns_file = io.open("bad-patterns.txt", "r")
for pattern in bad_patterns_file:lines() do
  table.insert(bad_patterns, pattern)
end
bad_patterns_file:close()

local ignore_patterns_file = io.open("ignore-patterns.txt", "r")
for pattern in ignore_patterns_file:lines() do
  table.insert(ignore_patterns, pattern)
end
ignore_patterns_file:close()

local page_requisite_patterns_file = io.open("page-requisite-patterns.txt", "r")
for pattern in page_requisite_patterns_file:lines() do
  table.insert(page_requisite_patterns, pattern)
end
page_requisite_patterns_file:close()

local extract_outlinks_patterns_file = io.open("extract-outlinks-patterns.txt", "r")
for pattern in extract_outlinks_patterns_file:lines() do
  extract_outlinks_patterns[pattern] = true
end
extract_outlinks_patterns_file:close()

read_file = function(file, bytes)
  if not bytes then
    bytes = "*all"
  end
  if file then
    local f = assert(io.open(file))
    local data = f:read(bytes)
    f:close()
    if not data then
      data = ""
    end
    return data
  else
    return ""
  end
end

table_length = function(t)
  local count = 0
  for _ in pairs(t) do
    count = count + 1
  end
  return count
end
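
-- Walk a URL's hostname from most to least specific, dropping one leading
-- label per step. Without a target, returns the first suffix listed in
-- extract_outlinks_patterns; with a target, reports whether the URL is on
-- that domain or one of its subdomains.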
check_domain_outlinks = function(url, target)
  local parent = string.match(url, "^https?://([^/]+)")
  while parent do
    if (not target and extract_outlinks_patterns[parent])
      or (target and parent == target) then
      return parent
    end
    parent = string.match(parent, "^[^%.]+%.(.+)$")
  end
  return false
end

bad_code = function(status_code)
  return status_code ~= 200
    and status_code ~= 301
    and status_code ~= 302
    and status_code ~= 303
    and status_code ~= 307
    and status_code ~= 308
    and status_code ~= 404
    and status_code ~= 410
end
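
-- Detect path loops such as /a/b/a/b/a/b by counting repeated
-- (case-insensitive, unescaped) path segments; returns true once any
-- segment occurs max_repetitions times.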
find_path_loop = function(url, max_repetitions)
  local tested = {}
  for s in string.gmatch(urlparse.unescape(url), "([^/]+)") do
    s = string.lower(s)
    if not tested[s] then
      if s == "" then
        tested[s] = -2
      else
        tested[s] = 0
      end
    end
    tested[s] = tested[s] + 1
    if tested[s] == max_repetitions then
      return true
    end
  end
  return false
end

percent_encode_url = function(url)
  temp = ""
  for c in string.gmatch(url, "(.)") do
    local b = string.byte(c)
    if b < 32 or b > 126 then
      c = string.format("%%%02X", b)
    end
    temp = temp .. c
  end
  return temp
end
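
-- Queue a URL for the backfeed. The URL is first cleaned up (JavaScript
-- string-concatenation artifacts removed, non-printable bytes
-- percent-encoded, everything from the first '{', '<' or '\' dropped). If
-- the current item carries custom settings with "all" set, the URL is
-- re-wrapped as a "custom:" item whose depth-style counters are each
-- decremented by one, so every hop spends one unit of the recursion budget.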
queue_url = function(url, withcustom)
  if not url then
    return nil
  end
  queue_new_urls(url)
  if not string.match(url, "^https?://[^/]+%.") then
    return nil
  end
  --local original = url
  load_setting_depth = function(s)
    n = tonumber(current_settings[s])
    if n == nil then
      n = 0
    end
    return n - 1
  end
  url = string.gsub(url, "'%s*%+%s*'", "")
  url = percent_encode_url(url)
  url = string.match(url, "^([^{]+)")
  url = string.match(url, "^([^<]+)")
  url = string.match(url, "^([^\\]+)")
  if current_settings and current_settings["all"] and withcustom then
    local depth = load_setting_depth("depth")
    local keep_random = load_setting_depth("keep_random")
    local keep_all = load_setting_depth("keep_all")
    local any_domain = load_setting_depth("any_domain")
    if depth >= 0 then
      local random = current_settings["random"]
      local all = current_settings["all"]
      if keep_random < 0 or random == "" then
        random = nil
        keep_random = nil
      end
      if keep_all < 0 or all == 0 then
        all = nil
        keep_all = nil
      end
      if any_domain <= 0 then
        any_domain = nil
      end
      local settings = {
        depth=depth,
        all=all,
        keep_all=keep_all,
        random=random,
        keep_random=keep_random,
        url=url,
        any_domain=any_domain
      }
      url = "custom:"
      for _, k in pairs(
        {"all", "any_domain", "depth", "keep_all", "keep_random", "random", "url"}
      ) do
        local v = settings[k]
        if v ~= nil then
          url = url .. k .. "=" .. urlparse.escape(tostring(v)) .. "&"
        end
      end
      url = string.sub(url, 1, -2)
    end
  end
  if not duplicate_urls[url] and not queued_urls[url] then
    if find_path_loop(url, 2) then
      return false
    end
    --print("queuing", original, url)
    queued_urls[url] = true
  end
end
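
-- Wrap a URL in a "custom:" item salted with the current year and month, so
-- the same root URL presumably resolves to the same item at most once per
-- calendar month.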
queue_monthly_url = function(url)
  local random_s = os.date("%Y%m", timestamp)
  url = percent_encode_url(url)
  queued_urls["custom:random=" .. random_s .. "&url=" .. urlparse.escape(tostring(url))] = true
end

remove_param = function(url, param_pattern)
  local newurl = url
  repeat
    url = newurl
    newurl = string.gsub(url, "([%?&;])" .. param_pattern .. "=[^%?&;]*[%?&;]?", "%1")
  until newurl == url
  return string.match(newurl, "^(.-)[%?&;]?$")
end
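
-- Derive cleaned-up variants of a URL and queue them too: "amp;" after a
-- separator dropped, parameters matching bad_params stripped, the query
-- string removed entirely, and entity-decoded fragments split on quotes and
-- backslashes.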
queue_new_urls = function(url)
  if not url then
    return nil
  end
  local newurl = string.gsub(url, "([%?&;])[aA][mM][pP];", "%1")
  if url == current_url then
    if newurl ~= url then
      queue_url(newurl)
    end
  end
  for _, param_pattern in pairs(bad_params) do
    newurl = remove_param(newurl, param_pattern)
  end
  if newurl ~= url then
    queue_url(newurl)
  end
  newurl = string.match(newurl, "^([^%?&]+)")
  if newurl ~= url then
    queue_url(newurl)
  end
  url = string.gsub(url, "&quot;", '"')
  url = string.gsub(url, "&amp;", "&")
  for newurl in string.gmatch(url, '([^"\\]+)') do
    if newurl ~= url then
      queue_url(newurl)
    end
  end
end

report_bad_url = function(url)
  if current_url ~= nil then
    bad_urls[current_url] = true
  else
    bad_urls[string.lower(url)] = true
  end
end

strip_url = function(url)
  url = string.match(url, "^https?://(.+)$")
  newurl = string.match(url, "^www%.(.+)$")
  if newurl then
    url = newurl
  end
  return url
end
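
-- Decide whether wget recurses into a child URL. Roughly: always follow
-- redirect parents; reject path loops and very deep URLs; hand document-like
-- extensions (PDF, ZIP, ...) to the queue as separate items; queue outlinks
-- to other domains instead of recursing into them; and apply the
-- UUID/page-requisite heuristics built above unless the item's custom
-- settings say otherwise.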
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
  local url = urlpos["url"]["url"]
  local parenturl = parent["url"]
  local extract_page_requisites = false

  local current_settings_all = current_settings and current_settings["all"]
  local current_settings_any_domain = current_settings and current_settings["any_domain"]

  --queue_monthly_url(string.match(url, "^(https?://[^/]+)") .. "/")

  if redirect_urls[parenturl] and not (
    status_code == 300 and string.match(parenturl, "^https?://[^/]*feb%-web%.ru/")
  ) then
    return true
  end

  if find_path_loop(url, 2) then
    return false
  end

  local _, count = string.gsub(url, "[/%?]", "")
  if count >= 16 then
    return false
  end

  for _, extension in pairs({
    "pdf",
    "doc[mx]?",
    "xls[mx]?",
    "ppt[mx]?",
    "zip",
    "odt",
    "odm",
    "ods",
    "odp",
    "xml",
    "json",
    "torrent"
  }) do
    if string.match(parenturl, "%." .. extension .. "$")
      or string.match(parenturl, "%." .. extension .. "[^a-z0-9A-Z]")
      or string.match(parenturl, "%." .. string.upper(extension) .. "$")
      or string.match(parenturl, "%." .. string.upper(extension) .. "[^a-z0-9A-Z]") then
      return false
    end
    if string.match(url, "%." .. extension .. "$")
      or string.match(url, "%." .. extension .. "[^a-z0-9A-Z]")
      or string.match(url, "%." .. string.upper(extension) .. "$")
      or string.match(url, "%." .. string.upper(extension) .. "[^a-z0-9A-Z]") then
      queue_url(url)
      return false
    end
  end

  local domain_match = checked_domains[item_first_url]
  if not domain_match then
    domain_match = check_domain_outlinks(item_first_url)
    if not domain_match then
      domain_match = "none"
    end
    checked_domains[item_first_url] = domain_match
  end
  if domain_match ~= "none" then
    extract_page_requisites = true
    local newurl_domain = string.match(url, "^https?://([^/]+)")
    local to_queue = true
    for domain, _ in pairs(redirect_domains) do
      if check_domain_outlinks(url, domain) then
        to_queue = false
        break
      end
    end
    if to_queue then
      queue_url(url)
      return false
    end
  end

  --[[if not extract_page_requisites then
    return false
  end]]

  if (status_code < 200 or status_code >= 300 or not verdict)
    and not current_settings_all then
    return false
  end

  --[[if string.len(url) == string.len(parenturl) then
    local good_url = false
    local index1, index2
    temp_url = string.match(url, "^https?://(.+)$")
    temp_parenturl = string.match(parenturl, "^https?://(.+)$")
    local start_index = 1
    repeat
      index1 = string.find(temp_url, "/", start_index)
      index2 = string.find(temp_parenturl, "/", start_index)
      if index1 ~= index2 then
        good_url = true
        break
      end
      if index1 then
        start_index = index1 + 1
      end
    until not index1 or not index2
    if not good_url then
      return false
    end
  end]]

  if parenturl_uuid == nil then
    parenturl_uuid = false
    for old_parent_url, _ in pairs(visited_urls) do
      for id_to_ignore, _ in pairs(ids_to_ignore) do
        if string.match(old_parent_url, id_to_ignore) then
          parenturl_uuid = true
          break
        end
      end
      if parenturl_uuid then
        break
      end
    end
  end
  if parenturl_uuid then
    for id_to_ignore, _ in pairs(ids_to_ignore) do
      if string.match(url, id_to_ignore) and not current_settings_all then
        return false
      end
    end
  end

  if urlpos["link_refresh_p"] ~= 0 then
    queue_url(url)
    return false
  end

  if parenturl_requisite == nil then
    parenturl_requisite = false
    for _, pattern in pairs(page_requisite_patterns) do
      for old_parent_url, _ in pairs(visited_urls) do
        if string.match(old_parent_url, pattern) then
          parenturl_requisite = true
          break
        end
      end
      if parenturl_requisite then
        break
      end
    end
  end
  if parenturl_requisite and not current_settings_all then
    return false
  end

  if urlpos["link_inline_p"] ~= 0 then
    queue_url(url)
    return false
  end

  local current_host = string.match(urlpos["url"]["host"], "([^%.]+%.[^%.]+)$")
  local first_parent_host = string.match(parent["host"], "([^%.]+%.[^%.]+)$")

  if current_url then
    first_parent_host = string.match(current_url .. "/", "^https?://[^/]-([^/%.]+%.[^/%.]+)/")
  end

  if current_settings_all and (
    current_settings_any_domain
    or first_parent_host == current_host
  ) then
    queue_url(url, true)
    return false
  end

  --[[for old_parent_url, _ in pairs(visited_urls) do
    for _, pattern in pairs(page_requisite_patterns) do
      if string.match(old_parent_url, pattern) then
        return false
      end
    end
  end

  for _, pattern in pairs(page_requisite_patterns) do
    if string.match(url, pattern) then
      queue_url(url)
      return false
    end
  end]]
end
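
-- Extract further URLs from a downloaded body. With deep_extract set (or
-- when called recursively on a converted file, url == nil) the raw HTML is
-- scanned for href attributes and bare http(s) strings. Bodies that look
-- like PDFs are converted with pdftohtml and fed back through this callback
-- with url == nil, which also enables entity decoding and plain-text
-- scraping.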
wget.callbacks.get_urls = function(file, url, is_css, iri)
  local html = nil

  if url then
    downloaded[url] = true
  end

  local function check(url, headers)
    local url = string.match(url, "^([^#]+)")
    url = string.gsub(url, "&amp;", "&")
    queue_url(url)
  end

  local function checknewurl(newurl, headers)
    if string.match(newurl, "^#") then
      return nil
    end
    if string.match(newurl, "\\[uU]002[fF]") then
      return checknewurl(string.gsub(newurl, "\\[uU]002[fF]", "/"), headers)
    end
    if string.match(newurl, "^https?:////") then
      check(string.gsub(newurl, ":////", "://"), headers)
    elseif string.match(newurl, "^https?://") then
      check(newurl, headers)
    elseif string.match(newurl, "^https?:\\/\\?/") then
      check(string.gsub(newurl, "\\", ""), headers)
    elseif not url then
      return nil
    elseif string.match(newurl, "^\\/") then
      checknewurl(string.gsub(newurl, "\\", ""), headers)
    elseif string.match(newurl, "^//") then
      check(urlparse.absolute(url, newurl), headers)
    elseif string.match(newurl, "^/") then
      check(urlparse.absolute(url, newurl), headers)
    elseif string.match(newurl, "^%.%./") then
      if string.match(url, "^https?://[^/]+/[^/]+/") then
        check(urlparse.absolute(url, newurl), headers)
      else
        checknewurl(string.match(newurl, "^%.%.(/.+)$"), headers)
      end
    elseif string.match(newurl, "^%./") then
      check(urlparse.absolute(url, newurl), headers)
    end
  end

  local function checknewshorturl(newurl, headers)
    if string.match(newurl, "^#") then
      return nil
    end
    if url and string.match(newurl, "^%?") then
      check(urlparse.absolute(url, newurl), headers)
    elseif url and not (string.match(newurl, "^https?:\\?/\\?//?/?")
      or string.match(newurl, "^[/\\]")
      or string.match(newurl, "^%./")
      or string.match(newurl, "^[jJ]ava[sS]cript:")
      or string.match(newurl, "^[mM]ail[tT]o:")
      or string.match(newurl, "^vine:")
      or string.match(newurl, "^android%-app:")
      or string.match(newurl, "^ios%-app:")
      or string.match(newurl, "^%${")) then
      check(urlparse.absolute(url, newurl), headers)
    else
      checknewurl(newurl, headers)
    end
  end

  if (status_code == 200 and current_settings and current_settings["deep_extract"])
    or not url then
    html = read_file(file)
    if not url then
      html = string.gsub(html, "&nbsp;", " ")
      html = string.gsub(html, "&lt;", "<")
      html = string.gsub(html, "&gt;", ">")
      html = string.gsub(html, "&quot;", '"')
      html = string.gsub(html, "&apos;", "'")
      html = string.gsub(html, "&#(%d+);",
        function(n)
          return string.char(n)
        end
      )
      html = string.gsub(html, "&#x(%d+);",
        function(n)
          return string.char(tonumber(n, 16))
        end
      )
      local temp_html = string.gsub(html, "\n", "")
      for _, remove in pairs({"", "<br/>", "</?p[^>]*>"}) do
        if remove ~= "" then
          temp_html = string.gsub(temp_html, remove, "")
        end
        for newurl in string.gmatch(temp_html, "(https?://[^%s<>#\"'\\`{})%]]+)") do
          while string.match(newurl, "[%.&,!;]$") do
            newurl = string.match(newurl, "^(.+).$")
          end
          check(newurl)
        end
      end
    end
    for newurl in string.gmatch(html, "[^%-][hH][rR][eE][fF]='([^']+)'") do
      checknewshorturl(newurl)
    end
    for newurl in string.gmatch(html, '[^%-][hH][rR][eE][fF]="([^"]+)"') do
      checknewshorturl(newurl)
    end
    for newurl in string.gmatch(string.gsub(html, "&[qQ][uU][oO][tT];", '"'), '"(https?://[^"]+)') do
      checknewurl(newurl)
    end
    for newurl in string.gmatch(string.gsub(html, "&#039;", "'"), "'(https?://[^']+)") do
      checknewurl(newurl)
    end
    if url then
      for newurl in string.gmatch(html, ">%s*([^<%s]+)") do
        checknewurl(newurl)
      end
    end
    --[[for newurl in string.gmatch(html, "%(([^%)]+)%)") do
      checknewurl(newurl)
    end]]
  elseif string.match(url, "^https?://[^/]+/.*[^a-z0-9A-Z][pP][dD][fF]$")
    or string.match(url, "^https?://[^/]+/.*[^a-z0-9A-Z][pP][dD][fF][^a-z0-9A-Z]")
    or string.match(read_file(file, 4), "%%[pP][dD][fF]") then
    io.stdout:write("Extracting links from PDF.\n")
    io.stdout:flush()
    local temp_file = file .. "-html.html"
    local check_file = io.open(temp_file)
    if check_file then
      check_file:close()
      os.remove(temp_file)
    end
    os.execute("pdftohtml -nodrm -hidden -i -s -q " .. file)
    check_file = io.open(temp_file)
    if check_file then
      check_file:close()
      local temp_length = table_length(queued_urls)
      wget.callbacks.get_urls(temp_file, nil, nil, nil)
      io.stdout:write("Found " .. tostring(table_length(queued_urls)-temp_length) .. " URLs.\n")
      io.stdout:flush()
      os.remove(temp_file)
    else
      io.stdout:write("Not a PDF.\n")
      io.stdout:flush()
    end
  end
end
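
-- Decide whether a response is written to the WARC. Known
-- consent/login/abuse redirect targets mark the item bad and schedule an
-- exit instead of being recorded. Note the unconditional
-- "if true then return true end" below: it leaves the Wayback Machine
-- size-based deduplication check after it disabled.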
wget.callbacks.write_to_warc = function(url, http_stat)
  local url_lower = string.lower(url["url"])
  if urls[url_lower] then
    current_url = url_lower
    current_settings = urls_settings[url_lower]
  end
  if current_settings and not current_settings["random"] then
    queue_url(url["url"])
    return false
  end
  if bad_code(http_stat["statcode"]) then
    return false
  elseif http_stat["statcode"] >= 300 and http_stat["statcode"] <= 399 then
    local newloc = urlparse.absolute(url["url"], http_stat["newloc"])
    if string.match(newloc, "^https?://[^/]*google%.com/sorry")
      or string.match(newloc, "^https?://[^/]*google%.com/[sS]ervice[lL]ogin")
      or string.match(newloc, "^https?://consent%.youtube%.com/")
      or string.match(newloc, "^https?://consent%.google%.com/")
      or string.match(newloc, "^https?://misuse%.ncbi%.nlm%.nih%.gov/")
      or string.match(newloc, "^https?://myprivacy%.dpgmedia%.nl/")
      or string.match(newloc, "^https?://idp%.springer%.com/authorize%?")
      or string.match(newloc, "^https?://[^/]*instagram%.com/accounts/") then
      report_bad_url(url["url"])
      exit_url = true
      return false
    end
    return true
  elseif http_stat["statcode"] ~= 200 then
    return true
  end
  if true then
    return true
  end
  if http_stat["len"] > min_dedup_mb * 1024 * 1024 then
    io.stdout:write("Data larger than " .. tostring(min_dedup_mb) .. " MB. Checking with Wayback Machine.\n")
    io.stdout:flush()
    while true do
      local body, code, headers, status = http.request(
        "https://web.archive.org/__wb/calendarcaptures/2"
        .. "?url=" .. urlparse.escape(url["url"])
        .. "&date=202"
      )
      if code ~= 200 then
        io.stdout:write("Got " .. tostring(code) .. " from the Wayback Machine.\n")
        io.stdout:flush()
        os.execute("sleep 10")
      else
        data = JSON:decode(body)
        if not data["items"] or not data["colls"] then
          return true
        end
        for _, item in pairs(data["items"]) do
          if item[2] == 200 then
            local coll_id = item[3] + 1
            if not coll_id then
              io.stdout:write("Could not get coll ID.\n")
              io.stdout:flush()
            end
            local collections = data["colls"][coll_id]
            if not collections then
              io.stdout:write("Could not get collections.\n")
              io.stdout:flush()
            end
            for _, collection in pairs(collections) do
              if collection == "archivebot"
                or string.find(collection, "archiveteam") then
                io.stdout:write("Archive Team got this URL before.\n")
                return false
              end
            end
          end
        end
        break
      end
    end
  end
  return true
end
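
-- Per-response bookkeeping: fetch a reference timestamp once, queue monthly
-- items for the origin's /, /robots.txt and /favicon.ico, track redirect
-- chains, and turn same-site (modulo www.) redirects and 301/308 responses
-- into fresh items rather than following them in-process.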
wget.callbacks.httploop_result = function(url, err, http_stat)
  status_code = http_stat["statcode"]

  parenturl_uuid = nil
  parenturl_requisite = nil

  local url_lower = string.lower(url["url"])
  if urls[url_lower] then
    current_url = url_lower
    current_settings = urls_settings[url_lower]
  end

  if not timestamp then
    local body, code, headers, status = http.request("https://legacy-api.arpa.li/now")
    assert(code == 200)
    timestamp = tonumber(string.match(body, "^([0-9]+)"))
  end

  if status_code ~= 0 then
    local base_url = string.match(url["url"], "^(https://[^/]+)")
    if base_url then
      for _, newurl in pairs({
        base_url .. "/robots.txt",
        base_url .. "/favicon.ico",
        base_url .. "/"
      }) do
        queue_monthly_url(newurl)
      end
    end
  end

  url_count = url_count + 1
  io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. " \n")
  io.stdout:flush()

  if redirect_domains["done"] then
    redirect_domains = {}
    redirect_urls = {}
    visited_urls = {}
    item_first_url = nil
  end
  redirect_domains[string.match(url["url"], "^https?://([^/]+)")] = true
  if not item_first_url then
    item_first_url = url["url"]
  end

  visited_urls[url["url"]] = true

  if exit_url then
    exit_url = false
    return wget.actions.EXIT
  end

  if status_code >= 300 and status_code <= 399 then
    local newloc = urlparse.absolute(url["url"], http_stat["newloc"])
    redirect_urls[url["url"]] = true
    --[[if strip_url(url["url"]) == strip_url(newloc) then
      queued_urls[newloc] = true
      return wget.actions.EXIT
    end]]
    if downloaded[newloc] then
      return wget.actions.EXIT
    elseif string.match(url["url"], "^https?://[^/]*telegram%.org/dl%?tme=")
      or (
        string.match(newloc, "^https?://www%.(.+)")
        or string.match(newloc, "^https?://(.+)")
      ) == (
        string.match(url["url"], "^https?://www%.(.+)")
        or string.match(url["url"], "^https?://(.+)")
      )
      or status_code == 301
      or status_code == 308 then
      queue_url(newloc)
      return wget.actions.EXIT
    end
  else
    redirect_domains["done"] = true
  end

  if downloaded[url["url"]] then
    report_bad_url(url["url"])
    return wget.actions.EXIT
  end

  for _, pattern in pairs(ignore_patterns) do
    if string.match(url["url"], pattern) then
      return wget.actions.EXIT
    end
  end

  if status_code >= 200 and status_code <= 399 then
    downloaded[url["url"]] = true
  end

  if status_code >= 200 and status_code < 300 then
    queue_new_urls(url["url"])
  end

  if bad_code(status_code) then
    io.stdout:write("Server returned " .. http_stat.statcode .. " (" .. err .. ").\n")
    io.stdout:flush()
    report_bad_url(url["url"])
    return wget.actions.EXIT
  end

  local sleep_time = 0

  if sleep_time > 0.001 then
    os.execute("sleep " .. sleep_time)
  end

  return wget.actions.NOTHING
end
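
-- At the end of the item, write the filtered queue to a side file and submit
-- it to the backfeed endpoint in NUL-separated batches of 100, retrying with
-- exponential backoff (2^tries seconds, 4 attempts) before aborting the grab.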
wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time)
  local function submit_backfeed(newurls)
    local tries = 0
    local maxtries = 4
    while tries < maxtries do
      local body, code, headers, status = http.request(
        "https://legacy-api.arpa.li/backfeed/legacy/urls-glx7ansh4e17aii",
        newurls .. "\0"
      )
      print(body)
      if code == 200 then
        io.stdout:write("Submitted discovered URLs.\n")
        io.stdout:flush()
        break
      end
      io.stdout:write("Failed to submit discovered URLs. " .. tostring(code) .. " " .. tostring(body) .. "\n")
      io.stdout:flush()
      os.execute("sleep " .. math.floor(math.pow(2, tries)))
      tries = tries + 1
    end
    if tries == maxtries then
      abortgrab = true
    end
  end

  local newurls = nil
  local is_bad = false
  local count = 0
  local dup_urls = io.open(item_dir .. "/" .. warc_file_base .. "_duplicate-urls.txt", "w")
  for url, _ in pairs(queued_urls) do
    for _, pattern in pairs(bad_patterns) do
      is_bad = string.match(url, pattern)
      if is_bad then
        io.stdout:write("Filtering out URL " .. url .. ".\n")
        io.stdout:flush()
        break
      end
    end
    if not is_bad then
      io.stdout:write("Queuing URL " .. url .. ".\n")
      io.stdout:flush()
      dup_urls:write(url .. "\n")
      if newurls == nil then
        newurls = url
      else
        newurls = newurls .. "\0" .. url
      end
      count = count + 1
      if count == 100 then
        submit_backfeed(newurls)
        newurls = nil
        count = 0
      end
    end
  end
  if newurls ~= nil then
    submit_backfeed(newurls)
  end
  dup_urls:close()

  local file = io.open(item_dir .. "/" .. warc_file_base .. "_bad-urls.txt", "w")
  for url, _ in pairs(bad_urls) do
    file:write(url .. "\n")
  end
  file:close()
end

wget.callbacks.before_exit = function(exit_status, exit_status_string)
  if abortgrab then
    return wget.exits.IO_FAIL
  end
  return exit_status
end