local urlparse = require("socket.url")
local http = require("socket.http")
JSON = (loadfile "JSON.lua")()

local item_dir = os.getenv("item_dir")
local item_name = os.getenv("item_name")
local custom_items = os.getenv("custom_items")
local warc_file_base = os.getenv("warc_file_base")

local url_count = 0
local downloaded = {}
local abortgrab = false
local exit_url = false
local min_dedup_mb = 5

local timestamp = nil

if urlparse == nil or http == nil then
  io.stdout:write("socket not correctly installed.\n")
  io.stdout:flush()
  abortgrab = true
end
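
-- An item is one or more URLs: item_name holds them newline-separated, and
-- custom_items optionally maps URLs to JSON-encoded per-URL settings
-- (depth, all, random, ...) that are merged into the same work set.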
local urls = {}
for url in string.gmatch(item_name, "([^\n]+)") do
  urls[string.lower(url)] = true
end

local urls_settings = JSON:decode(custom_items)
for k, _ in pairs(urls_settings) do
  urls[string.lower(k)] = true
end

local status_code = nil

local redirect_urls = {}
local visited_urls = {}
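
-- Lua patterns for URL components that look auto-generated: UUID-shaped hex
-- groups (8-4-4-4-12 and 8-4-4-12), 9-10 digit query values and
-- digits-dot-digits numbers (timestamp-like), long alphanumeric tokens with
-- digit suffixes, and 32-character hex strings (MD5-like). The
-- interpretations here are inferred from the pattern shapes.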
local ids_to_ignore = {}
for _, lengths in pairs({{8, 4, 4, 4, 12}, {8, 4, 4, 12}}) do
  local uuid = ""
  for _, i in pairs(lengths) do
    for j=1,i do
      uuid = uuid .. "[0-9a-fA-F]"
    end
    if i ~= 12 then
      uuid = uuid .. "%-"
    end
  end
  ids_to_ignore[uuid] = true
end
local to_ignore = ""
for i=1,9 do
  to_ignore = to_ignore .. "[0-9]"
end
ids_to_ignore["%?" .. to_ignore .. "$"] = true
ids_to_ignore["%?" .. to_ignore .. "[0-9]$"] = true
ids_to_ignore[to_ignore .. "[0-9]%.[0-9][0-9][0-9][0-9]$"] = true
to_ignore = ""
for i=1,50 do
  to_ignore = to_ignore .. "[0-9a-zA-Z]"
end
ids_to_ignore[to_ignore .. "%-[0-9][0-9][0-9][0-9][0-9]"] = true
ids_to_ignore["[0-9a-zA-Z%-_]!%-?[0-9]"] = true
to_ignore = ""
for i=1,32 do
  to_ignore = to_ignore .. "[0-9a-fA-F]"
end
ids_to_ignore["[^0-9a-fA-F]" .. to_ignore .. "[^0-9a-fA-F]"] = true
ids_to_ignore["[^0-9a-fA-F]" .. to_ignore .. "$"] = true

local current_url = nil
local current_settings = nil
local bad_urls = {}
local queued_urls = {}
local bad_params = {}
local bad_patterns = {}
local ignore_patterns = {}
local page_requisite_patterns = {}
local duplicate_urls = {}
local extract_outlinks_patterns = {}
local item_first_url = nil
local redirect_domains = {}
local checked_domains = {}

local parenturl_uuid = nil
local parenturl_requisite = nil

local dupes_file = io.open("duplicate-urls.txt", "r")
for url in dupes_file:lines() do
  duplicate_urls[url] = true
end
dupes_file:close()
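
-- Each line of bad-params.txt is a query parameter name; every letter is
-- expanded into a [xX] character class so matching is case-insensitive.
-- For example, a (hypothetical) line "ref" becomes "[rR][eE][fF]".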
local bad_params_file = io.open("bad-params.txt", "r")
for param in bad_params_file:lines() do
  local param = string.gsub(
    param, "([a-zA-Z])",
    function(c)
      return "[" .. string.lower(c) .. string.upper(c) .. "]"
    end
  )
  table.insert(bad_params, param)
end
bad_params_file:close()

local bad_patterns_file = io.open("bad-patterns.txt", "r")
for pattern in bad_patterns_file:lines() do
  table.insert(bad_patterns, pattern)
end
bad_patterns_file:close()

local ignore_patterns_file = io.open("ignore-patterns.txt", "r")
for pattern in ignore_patterns_file:lines() do
  table.insert(ignore_patterns, pattern)
end
ignore_patterns_file:close()

local page_requisite_patterns_file = io.open("page-requisite-patterns.txt", "r")
for pattern in page_requisite_patterns_file:lines() do
  table.insert(page_requisite_patterns, pattern)
end
page_requisite_patterns_file:close()

local extract_outlinks_patterns_file = io.open("extract-outlinks-patterns.txt", "r")
for pattern in extract_outlinks_patterns_file:lines() do
  extract_outlinks_patterns[pattern] = true
end
extract_outlinks_patterns_file:close()

read_file = function(file, bytes)
  if not bytes then
    bytes = "*all"
  end
  if file then
    local f = assert(io.open(file))
    local data = f:read(bytes)
    f:close()
    if not data then
      data = ""
    end
    return data
  else
    return ""
  end
end

table_length = function(t)
  local count = 0
  for _ in pairs(t) do
    count = count + 1
  end
  return count
end
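
-- Walk a URL's hostname from most to least specific, dropping one leading
-- label per step. Without a target, returns the first suffix listed in
-- extract_outlinks_patterns; with a target, reports whether the URL is on
-- that domain or one of its subdomains.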
check_domain_outlinks = function(url, target)
  local parent = string.match(url, "^https?://([^/]+)")
  while parent do
    if (not target and extract_outlinks_patterns[parent])
      or (target and parent == target) then
      return parent
    end
    parent = string.match(parent, "^[^%.]+%.(.+)$")
  end
  return false
end

bad_code = function(status_code)
  return status_code ~= 200
    and status_code ~= 301
    and status_code ~= 302
    and status_code ~= 303
    and status_code ~= 307
    and status_code ~= 308
    and status_code ~= 404
    and status_code ~= 410
end
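
-- Detect path loops such as /a/b/a/b/a/b by counting repeated
-- (case-insensitive, unescaped) path segments; returns true once any
-- segment occurs max_repetitions times.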
find_path_loop = function(url, max_repetitions)
  local tested = {}
  for s in string.gmatch(urlparse.unescape(url), "([^/]+)") do
    s = string.lower(s)
    if not tested[s] then
      if s == "" then
        tested[s] = -2
      else
        tested[s] = 0
      end
    end
    tested[s] = tested[s] + 1
    if tested[s] == max_repetitions then
      return true
    end
  end
  return false
end

percent_encode_url = function(url)
  temp = ""
  for c in string.gmatch(url, "(.)") do
    local b = string.byte(c)
    if b < 32 or b > 126 then
      c = string.format("%%%02X", b)
    end
    temp = temp .. c
  end
  return temp
end
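
-- Queue a URL for the backfeed. The URL is first cleaned up (JavaScript
-- string-concatenation artifacts removed, non-printable bytes
-- percent-encoded, everything from the first '{', '<' or '\' dropped). If
-- the current item carries custom settings with "all" set, the URL is
-- re-wrapped as a "custom:" item whose depth-style counters are each
-- decremented by one, so every hop spends one unit of the recursion budget.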
queue_url = function(url, withcustom)
  if not url then
    return nil
  end
  queue_new_urls(url)
  if not string.match(url, "^https?://[^/]+%.") then
    return nil
  end
  --local original = url
  load_setting_depth = function(s)
    n = tonumber(current_settings[s])
    if n == nil then
      n = 0
    end
    return n - 1
  end
  url = string.gsub(url, "'%s*%+%s*'", "")
  url = percent_encode_url(url)
  url = string.match(url, "^([^{]+)")
  url = string.match(url, "^([^<]+)")
  url = string.match(url, "^([^\\]+)")
  if current_settings and current_settings["all"] and withcustom then
    local depth = load_setting_depth("depth")
    local keep_random = load_setting_depth("keep_random")
    local keep_all = load_setting_depth("keep_all")
    local any_domain = load_setting_depth("any_domain")
    if depth >= 0 then
      local random = current_settings["random"]
      local all = current_settings["all"]
      if keep_random < 0 or random == "" then
        random = nil
        keep_random = nil
      end
      if keep_all < 0 or all == 0 then
        all = nil
        keep_all = nil
      end
      if any_domain <= 0 then
        any_domain = nil
      end
      local settings = {
        depth=depth,
        all=all,
        keep_all=keep_all,
        random=random,
        keep_random=keep_random,
        url=url,
        any_domain=any_domain
      }
      url = "custom:"
      for _, k in pairs(
        {"all", "any_domain", "depth", "keep_all", "keep_random", "random", "url"}
      ) do
        local v = settings[k]
        if v ~= nil then
          url = url .. k .. "=" .. urlparse.escape(tostring(v)) .. "&"
        end
      end
      url = string.sub(url, 1, -2)
    end
  end
  if not duplicate_urls[url] and not queued_urls[url] then
    if find_path_loop(url, 2) then
      return false
    end
    --print("queuing", original, url)
    queued_urls[url] = true
  end
end
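
-- Wrap a URL in a "custom:" item salted with the current year and month, so
-- the same root URL presumably resolves to the same item at most once per
-- calendar month.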
queue_monthly_url = function(url)
  local random_s = os.date("%Y%m", timestamp)
  url = percent_encode_url(url)
  queued_urls["custom:random=" .. random_s .. "&url=" .. urlparse.escape(tostring(url))] = true
end

remove_param = function(url, param_pattern)
  local newurl = url
  repeat
    url = newurl
    newurl = string.gsub(url, "([%?&;])" .. param_pattern .. "=[^%?&;]*[%?&;]?", "%1")
  until newurl == url
  return string.match(newurl, "^(.-)[%?&;]?$")
end
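
-- Derive cleaned-up variants of a URL and queue them too: "amp;" after a
-- separator dropped, parameters matching bad_params stripped, the query
-- string removed entirely, and entity-decoded fragments split on quotes and
-- backslashes.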
queue_new_urls = function(url)
  if not url then
    return nil
  end
  local newurl = string.gsub(url, "([%?&;])[aA][mM][pP];", "%1")
  if url == current_url then
    if newurl ~= url then
      queue_url(newurl)
    end
  end
  for _, param_pattern in pairs(bad_params) do
    newurl = remove_param(newurl, param_pattern)
  end
  if newurl ~= url then
    queue_url(newurl)
  end
  newurl = string.match(newurl, "^([^%?&]+)")
  if newurl ~= url then
    queue_url(newurl)
  end
  url = string.gsub(url, "&quot;", '"')
  url = string.gsub(url, "&amp;", "&")
  for newurl in string.gmatch(url, '([^"\\]+)') do
    if newurl ~= url then
      queue_url(newurl)
    end
  end
end

report_bad_url = function(url)
  if current_url ~= nil then
    bad_urls[current_url] = true
  else
    bad_urls[string.lower(url)] = true
  end
end

strip_url = function(url)
  url = string.match(url, "^https?://(.+)$")
  newurl = string.match(url, "^www%.(.+)$")
  if newurl then
    url = newurl
  end
  return url
end
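
-- Decide whether wget recurses into a child URL. Roughly: always follow
-- redirect parents; reject path loops and very deep URLs; hand document-like
-- extensions (PDF, ZIP, ...) to the queue as separate items; queue outlinks
-- to other domains instead of recursing into them; and apply the
-- UUID/page-requisite heuristics built above unless the item's custom
-- settings say otherwise.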
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
  local url = urlpos["url"]["url"]
  local parenturl = parent["url"]
  local extract_page_requisites = false

  local current_settings_all = current_settings and current_settings["all"]
  local current_settings_any_domain = current_settings and current_settings["any_domain"]

  --queue_monthly_url(string.match(url, "^(https?://[^/]+)") .. "/")

  if redirect_urls[parenturl] and not (
    status_code == 300 and string.match(parenturl, "^https?://[^/]*feb%-web%.ru/")
  ) then
    return true
  end

  if find_path_loop(url, 2) then
    return false
  end

  local _, count = string.gsub(url, "[/%?]", "")
  if count >= 16 then
    return false
  end

  for _, extension in pairs({
    "pdf",
    "doc[mx]?",
    "xls[mx]?",
    "ppt[mx]?",
    "zip",
    "odt",
    "odm",
    "ods",
    "odp",
    "xml",
    "json",
    "torrent"
  }) do
    if string.match(parenturl, "%." .. extension .. "$")
      or string.match(parenturl, "%." .. extension .. "[^a-z0-9A-Z]")
      or string.match(parenturl, "%." .. string.upper(extension) .. "$")
      or string.match(parenturl, "%." .. string.upper(extension) .. "[^a-z0-9A-Z]") then
      return false
    end
    if string.match(url, "%." .. extension .. "$")
      or string.match(url, "%." .. extension .. "[^a-z0-9A-Z]")
      or string.match(url, "%." .. string.upper(extension) .. "$")
      or string.match(url, "%." .. string.upper(extension) .. "[^a-z0-9A-Z]") then
      queue_url(url)
      return false
    end
  end

  local domain_match = checked_domains[item_first_url]
  if not domain_match then
    domain_match = check_domain_outlinks(item_first_url)
    if not domain_match then
      domain_match = "none"
    end
    checked_domains[item_first_url] = domain_match
  end
  if domain_match ~= "none" then
    extract_page_requisites = true
    local newurl_domain = string.match(url, "^https?://([^/]+)")
    local to_queue = true
    for domain, _ in pairs(redirect_domains) do
      if check_domain_outlinks(url, domain) then
        to_queue = false
        break
      end
    end
    if to_queue then
      queue_url(url)
      return false
    end
  end

  --[[if not extract_page_requisites then
    return false
  end]]

  if (status_code < 200 or status_code >= 300 or not verdict)
    and not current_settings_all then
    return false
  end

  --[[if string.len(url) == string.len(parenturl) then
    local good_url = false
    local index1, index2
    temp_url = string.match(url, "^https?://(.+)$")
    temp_parenturl = string.match(parenturl, "^https?://(.+)$")
    local start_index = 1
    repeat
      index1 = string.find(temp_url, "/", start_index)
      index2 = string.find(temp_parenturl, "/", start_index)
      if index1 ~= index2 then
        good_url = true
        break
      end
      if index1 then
        start_index = index1 + 1
      end
    until not index1 or not index2
    if not good_url then
      return false
    end
  end]]

  if parenturl_uuid == nil then
    parenturl_uuid = false
    for old_parent_url, _ in pairs(visited_urls) do
      for id_to_ignore, _ in pairs(ids_to_ignore) do
        if string.match(old_parent_url, id_to_ignore) then
          parenturl_uuid = true
          break
        end
      end
      if parenturl_uuid then
        break
      end
    end
  end
  if parenturl_uuid then
    for id_to_ignore, _ in pairs(ids_to_ignore) do
      if string.match(url, id_to_ignore) and not current_settings_all then
        return false
      end
    end
  end

  if urlpos["link_refresh_p"] ~= 0 then
    queue_url(url)
    return false
  end

  if parenturl_requisite == nil then
    parenturl_requisite = false
    for _, pattern in pairs(page_requisite_patterns) do
      for old_parent_url, _ in pairs(visited_urls) do
        if string.match(old_parent_url, pattern) then
          parenturl_requisite = true
          break
        end
      end
      if parenturl_requisite then
        break
      end
    end
  end
  if parenturl_requisite and not current_settings_all then
    return false
  end

  if urlpos["link_inline_p"] ~= 0 then
    queue_url(url)
    return false
  end

  local current_host = string.match(urlpos["url"]["host"], "([^%.]+%.[^%.]+)$")
  local first_parent_host = string.match(parent["host"], "([^%.]+%.[^%.]+)$")

  if current_url then
    first_parent_host = string.match(current_url .. "/", "^https?://[^/]-([^/%.]+%.[^/%.]+)/")
  end

  if current_settings_all and (
    current_settings_any_domain
    or first_parent_host == current_host
  ) then
    queue_url(url, true)
    return false
  end

  --[[for old_parent_url, _ in pairs(visited_urls) do
    for _, pattern in pairs(page_requisite_patterns) do
      if string.match(old_parent_url, pattern) then
        return false
      end
    end
  end

  for _, pattern in pairs(page_requisite_patterns) do
    if string.match(url, pattern) then
      queue_url(url)
      return false
    end
  end]]
end
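
-- Extract further URLs from a downloaded body. With deep_extract set (or
-- when called recursively on a converted file, url == nil) the raw HTML is
-- scanned for href attributes and bare http(s) strings. Bodies that look
-- like PDFs are converted with pdftohtml and fed back through this callback
-- with url == nil, which also enables entity decoding and plain-text
-- scraping.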
wget.callbacks.get_urls = function(file, url, is_css, iri)
  local html = nil

  if url then
    downloaded[url] = true
  end

  local function check(url, headers)
    local url = string.match(url, "^([^#]+)")
    url = string.gsub(url, "&amp;", "&")
    queue_url(url)
  end

  local function checknewurl(newurl, headers)
    if string.match(newurl, "^#") then
      return nil
    end
    if string.match(newurl, "\\[uU]002[fF]") then
      return checknewurl(string.gsub(newurl, "\\[uU]002[fF]", "/"), headers)
    end
    if string.match(newurl, "^https?:////") then
      check(string.gsub(newurl, ":////", "://"), headers)
    elseif string.match(newurl, "^https?://") then
      check(newurl, headers)
    elseif string.match(newurl, "^https?:\\/\\?/") then
      check(string.gsub(newurl, "\\", ""), headers)
    elseif not url then
      return nil
    elseif string.match(newurl, "^\\/") then
      checknewurl(string.gsub(newurl, "\\", ""), headers)
    elseif string.match(newurl, "^//") then
      check(urlparse.absolute(url, newurl), headers)
    elseif string.match(newurl, "^/") then
      check(urlparse.absolute(url, newurl), headers)
    elseif string.match(newurl, "^%.%./") then
      if string.match(url, "^https?://[^/]+/[^/]+/") then
        check(urlparse.absolute(url, newurl), headers)
      else
        checknewurl(string.match(newurl, "^%.%.(/.+)$"), headers)
      end
    elseif string.match(newurl, "^%./") then
      check(urlparse.absolute(url, newurl), headers)
    end
  end

  local function checknewshorturl(newurl, headers)
    if string.match(newurl, "^#") then
      return nil
    end
    if url and string.match(newurl, "^%?") then
      check(urlparse.absolute(url, newurl), headers)
    elseif url and not (string.match(newurl, "^https?:\\?/\\?//?/?")
      or string.match(newurl, "^[/\\]")
      or string.match(newurl, "^%./")
      or string.match(newurl, "^[jJ]ava[sS]cript:")
      or string.match(newurl, "^[mM]ail[tT]o:")
      or string.match(newurl, "^vine:")
      or string.match(newurl, "^android%-app:")
      or string.match(newurl, "^ios%-app:")
      or string.match(newurl, "^%${")) then
      check(urlparse.absolute(url, newurl), headers)
    else
      checknewurl(newurl, headers)
    end
  end

  if (status_code == 200 and current_settings and current_settings["deep_extract"])
    or not url then
    html = read_file(file)
    if not url then
      html = string.gsub(html, "&nbsp;", " ")
      html = string.gsub(html, "&lt;", "<")
      html = string.gsub(html, "&gt;", ">")
      html = string.gsub(html, "&quot;", '"')
      html = string.gsub(html, "&apos;", "'")
      html = string.gsub(html, "&#(%d+);",
        function(n)
          return string.char(n)
        end
      )
      html = string.gsub(html, "&#x(%d+);",
        function(n)
          return string.char(tonumber(n, 16))
        end
      )
      local temp_html = string.gsub(html, "\n", "")
      for _, remove in pairs({"", "<br/>", "</?p[^>]*>"}) do
        if remove ~= "" then
          temp_html = string.gsub(temp_html, remove, "")
        end
        for newurl in string.gmatch(temp_html, "(https?://[^%s<>#\"'\\`{})%]]+)") do
          while string.match(newurl, "[%.&,!;]$") do
            newurl = string.match(newurl, "^(.+).$")
          end
          check(newurl)
        end
      end
    end
    for newurl in string.gmatch(html, "[^%-][hH][rR][eE][fF]='([^']+)'") do
      checknewshorturl(newurl)
    end
    for newurl in string.gmatch(html, '[^%-][hH][rR][eE][fF]="([^"]+)"') do
      checknewshorturl(newurl)
    end
    for newurl in string.gmatch(string.gsub(html, "&[qQ][uU][oO][tT];", '"'), '"(https?://[^"]+)') do
      checknewurl(newurl)
    end
    for newurl in string.gmatch(string.gsub(html, "&#039;", "'"), "'(https?://[^']+)") do
      checknewurl(newurl)
    end
    if url then
      for newurl in string.gmatch(html, ">%s*([^<%s]+)") do
        checknewurl(newurl)
      end
    end
    --[[for newurl in string.gmatch(html, "%(([^%)]+)%)") do
      checknewurl(newurl)
    end]]
  elseif string.match(url, "^https?://[^/]+/.*[^a-z0-9A-Z][pP][dD][fF]$")
    or string.match(url, "^https?://[^/]+/.*[^a-z0-9A-Z][pP][dD][fF][^a-z0-9A-Z]")
    or string.match(read_file(file, 4), "%%[pP][dD][fF]") then
    io.stdout:write("Extracting links from PDF.\n")
    io.stdout:flush()
    local temp_file = file .. "-html.html"
    local check_file = io.open(temp_file)
    if check_file then
      check_file:close()
      os.remove(temp_file)
    end
    os.execute("pdftohtml -nodrm -hidden -i -s -q " .. file)
    check_file = io.open(temp_file)
    if check_file then
      check_file:close()
      local temp_length = table_length(queued_urls)
      wget.callbacks.get_urls(temp_file, nil, nil, nil)
      io.stdout:write("Found " .. tostring(table_length(queued_urls)-temp_length) .. " URLs.\n")
      io.stdout:flush()
      os.remove(temp_file)
    else
      io.stdout:write("Not a PDF.\n")
      io.stdout:flush()
    end
  end
end
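
-- Decide whether a response is written to the WARC. Known
-- consent/login/abuse redirect targets mark the item bad and schedule an
-- exit instead of being recorded. Note the unconditional
-- "if true then return true end" below: it leaves the Wayback Machine
-- size-based deduplication check after it disabled.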
wget.callbacks.write_to_warc = function(url, http_stat)
  local url_lower = string.lower(url["url"])
  if urls[url_lower] then
    current_url = url_lower
    current_settings = urls_settings[url_lower]
  end
  if current_settings and not current_settings["random"] then
    queue_url(url["url"])
    return false
  end
  if bad_code(http_stat["statcode"]) then
    return false
  elseif http_stat["statcode"] >= 300 and http_stat["statcode"] <= 399 then
    local newloc = urlparse.absolute(url["url"], http_stat["newloc"])
    if string.match(newloc, "^https?://[^/]*google%.com/sorry")
      or string.match(newloc, "^https?://[^/]*google%.com/[sS]ervice[lL]ogin")
      or string.match(newloc, "^https?://consent%.youtube%.com/")
      or string.match(newloc, "^https?://consent%.google%.com/")
      or string.match(newloc, "^https?://misuse%.ncbi%.nlm%.nih%.gov/")
      or string.match(newloc, "^https?://myprivacy%.dpgmedia%.nl/")
      or string.match(newloc, "^https?://idp%.springer%.com/authorize%?")
      or string.match(newloc, "^https?://[^/]*instagram%.com/accounts/") then
      report_bad_url(url["url"])
      exit_url = true
      return false
    end
    return true
  elseif http_stat["statcode"] ~= 200 then
    return true
  end
  if true then
    return true
  end
  if http_stat["len"] > min_dedup_mb * 1024 * 1024 then
    io.stdout:write("Data larger than " .. tostring(min_dedup_mb) .. " MB. Checking with Wayback Machine.\n")
    io.stdout:flush()
    while true do
      local body, code, headers, status = http.request(
        "https://web.archive.org/__wb/calendarcaptures/2"
        .. "?url=" .. urlparse.escape(url["url"])
        .. "&date=202"
      )
      if code ~= 200 then
        io.stdout:write("Got " .. tostring(code) .. " from the Wayback Machine.\n")
        io.stdout:flush()
        os.execute("sleep 10")
      else
        data = JSON:decode(body)
        if not data["items"] or not data["colls"] then
          return true
        end
        for _, item in pairs(data["items"]) do
          if item[2] == 200 then
            local coll_id = item[3] + 1
            if not coll_id then
              io.stdout:write("Could not get coll ID.\n")
              io.stdout:flush()
            end
            local collections = data["colls"][coll_id]
            if not collections then
              io.stdout:write("Could not get collections.\n")
              io.stdout:flush()
            end
            for _, collection in pairs(collections) do
              if collection == "archivebot"
                or string.find(collection, "archiveteam") then
                io.stdout:write("Archive Team got this URL before.\n")
                return false
              end
            end
          end
        end
        break
      end
    end
  end
  return true
end
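
-- Per-response bookkeeping: fetch a reference timestamp once, queue monthly
-- items for the origin's /, /robots.txt and /favicon.ico, track redirect
-- chains, and turn same-site (modulo www.) redirects and 301/308 responses
-- into fresh items rather than following them in-process.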
wget.callbacks.httploop_result = function(url, err, http_stat)
  status_code = http_stat["statcode"]

  parenturl_uuid = nil
  parenturl_requisite = nil

  local url_lower = string.lower(url["url"])
  if urls[url_lower] then
    current_url = url_lower
    current_settings = urls_settings[url_lower]
  end

  if not timestamp then
    local body, code, headers, status = http.request("https://legacy-api.arpa.li/now")
    assert(code == 200)
    timestamp = tonumber(string.match(body, "^([0-9]+)"))
  end

  if status_code ~= 0 then
    local base_url = string.match(url["url"], "^(https://[^/]+)")
    if base_url then
      for _, newurl in pairs({
        base_url .. "/robots.txt",
        base_url .. "/favicon.ico",
        base_url .. "/"
      }) do
        queue_monthly_url(newurl)
      end
    end
  end

  url_count = url_count + 1
  io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. " \n")
  io.stdout:flush()

  if redirect_domains["done"] then
    redirect_domains = {}
    redirect_urls = {}
    visited_urls = {}
    item_first_url = nil
  end
  redirect_domains[string.match(url["url"], "^https?://([^/]+)")] = true
  if not item_first_url then
    item_first_url = url["url"]
  end

  visited_urls[url["url"]] = true

  if exit_url then
    exit_url = false
    return wget.actions.EXIT
  end

  if status_code >= 300 and status_code <= 399 then
    local newloc = urlparse.absolute(url["url"], http_stat["newloc"])
    redirect_urls[url["url"]] = true
    --[[if strip_url(url["url"]) == strip_url(newloc) then
      queued_urls[newloc] = true
      return wget.actions.EXIT
    end]]
    if downloaded[newloc] then
      return wget.actions.EXIT
    elseif string.match(url["url"], "^https?://[^/]*telegram%.org/dl%?tme=")
      or (
        string.match(newloc, "^https?://www%.(.+)")
        or string.match(newloc, "^https?://(.+)")
      ) == (
        string.match(url["url"], "^https?://www%.(.+)")
        or string.match(url["url"], "^https?://(.+)")
      )
      or status_code == 301
      or status_code == 308 then
      queue_url(newloc)
      return wget.actions.EXIT
    end
  else
    redirect_domains["done"] = true
  end

  if downloaded[url["url"]] then
    report_bad_url(url["url"])
    return wget.actions.EXIT
  end

  for _, pattern in pairs(ignore_patterns) do
    if string.match(url["url"], pattern) then
      return wget.actions.EXIT
    end
  end

  if status_code >= 200 and status_code <= 399 then
    downloaded[url["url"]] = true
  end

  if status_code >= 200 and status_code < 300 then
    queue_new_urls(url["url"])
  end

  if bad_code(status_code) then
    io.stdout:write("Server returned " .. http_stat.statcode .. " (" .. err .. ").\n")
    io.stdout:flush()
    report_bad_url(url["url"])
    return wget.actions.EXIT
  end

  local sleep_time = 0

  if sleep_time > 0.001 then
    os.execute("sleep " .. sleep_time)
  end

  return wget.actions.NOTHING
end
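
-- At the end of the item, write the filtered queue to a side file and submit
-- it to the backfeed endpoint in NUL-separated batches of 100, retrying with
-- exponential backoff (2^tries seconds, 4 attempts) before aborting the grab.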
wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time)
  local function submit_backfeed(newurls)
    local tries = 0
    local maxtries = 4
    while tries < maxtries do
      local body, code, headers, status = http.request(
        "https://legacy-api.arpa.li/backfeed/legacy/urls-glx7ansh4e17aii",
        newurls .. "\0"
      )
      print(body)
      if code == 200 then
        io.stdout:write("Submitted discovered URLs.\n")
        io.stdout:flush()
        break
      end
      io.stdout:write("Failed to submit discovered URLs. " .. tostring(code) .. " " .. tostring(body) .. "\n")
      io.stdout:flush()
      os.execute("sleep " .. math.floor(math.pow(2, tries)))
      tries = tries + 1
    end
    if tries == maxtries then
      abortgrab = true
    end
  end

  local newurls = nil
  local is_bad = false
  local count = 0
  local dup_urls = io.open(item_dir .. "/" .. warc_file_base .. "_duplicate-urls.txt", "w")
  for url, _ in pairs(queued_urls) do
    for _, pattern in pairs(bad_patterns) do
      is_bad = string.match(url, pattern)
      if is_bad then
        io.stdout:write("Filtering out URL " .. url .. ".\n")
        io.stdout:flush()
        break
      end
    end
    if not is_bad then
      io.stdout:write("Queuing URL " .. url .. ".\n")
      io.stdout:flush()
      dup_urls:write(url .. "\n")
      if newurls == nil then
        newurls = url
      else
        newurls = newurls .. "\0" .. url
      end
      count = count + 1
      if count == 100 then
        submit_backfeed(newurls)
        newurls = nil
        count = 0
      end
    end
  end
  if newurls ~= nil then
    submit_backfeed(newurls)
  end
  dup_urls:close()

  local file = io.open(item_dir .. "/" .. warc_file_base .. "_bad-urls.txt", "w")
  for url, _ in pairs(bad_urls) do
    file:write(url .. "\n")
  end
  file:close()
end

wget.callbacks.before_exit = function(exit_status, exit_status_string)
  if abortgrab then
    return wget.exits.IO_FAIL
  end
  return exit_status
end