commit b5b36df7459a98ac75645b40dcfd1760a5b5b94f Author: TheTechRobo <52163910+TheTechRobo@users.noreply.github.com> Date: Tue May 17 21:36:37 2022 -0400 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..19db297 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*~ +*.pyc +wget-lua +wget-at +STOP +BANNED +data/ +test/ +duplicate-urls.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2090d27 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,3 @@ +FROM atdr.meo.ws/archiveteam/grab-base +COPY . /grab +RUN ln -fs /usr/local/bin/wget-lua /grab/wget-at diff --git a/JSON.lua b/JSON.lua new file mode 100644 index 0000000..5f11425 --- /dev/null +++ b/JSON.lua @@ -0,0 +1,1053 @@ +-- -*- coding: utf-8 -*- +-- +-- Simple JSON encoding and decoding in pure Lua. +-- +-- Copyright 2010-2014 Jeffrey Friedl +-- http://regex.info/blog/ +-- +-- Latest version: http://regex.info/blog/lua/json +-- +-- This code is released under a Creative Commons CC-BY "Attribution" License: +-- http://creativecommons.org/licenses/by/3.0/deed.en_US +-- +-- It can be used for any purpose so long as the copyright notice above, +-- the web-page links above, and the 'AUTHOR_NOTE' string below are +-- maintained. Enjoy. +-- +local VERSION = 20141223.14 -- version history at end of file +local AUTHOR_NOTE = "-[ JSON.lua package by Jeffrey Friedl (http://regex.info/blog/lua/json) version 20141223.14 ]-" + +-- +-- The 'AUTHOR_NOTE' variable exists so that information about the source +-- of the package is maintained even in compiled versions. It's also +-- included in OBJDEF below mostly to quiet warnings about unused variables. +-- +local OBJDEF = { + VERSION = VERSION, + AUTHOR_NOTE = AUTHOR_NOTE, +} + + +-- +-- Simple JSON encoding and decoding in pure Lua. 
+-- http://www.json.org/ +-- +-- +-- JSON = assert(loadfile "JSON.lua")() -- one-time load of the routines +-- +-- local lua_value = JSON:decode(raw_json_text) +-- +-- local raw_json_text = JSON:encode(lua_table_or_value) +-- local pretty_json_text = JSON:encode_pretty(lua_table_or_value) -- "pretty printed" version for human readability +-- +-- +-- +-- DECODING (from a JSON string to a Lua table) +-- +-- +-- JSON = assert(loadfile "JSON.lua")() -- one-time load of the routines +-- +-- local lua_value = JSON:decode(raw_json_text) +-- +-- If the JSON text is for an object or an array, e.g. +-- { "what": "books", "count": 3 } +-- or +-- [ "Larry", "Curly", "Moe" ] +-- +-- the result is a Lua table, e.g. +-- { what = "books", count = 3 } +-- or +-- { "Larry", "Curly", "Moe" } +-- +-- +-- The encode and decode routines accept an optional second argument, +-- "etc", which is not used during encoding or decoding, but upon error +-- is passed along to error handlers. It can be of any type (including nil). +-- +-- +-- +-- ERROR HANDLING +-- +-- With most errors during decoding, this code calls +-- +-- JSON:onDecodeError(message, text, location, etc) +-- +-- with a message about the error, and if known, the JSON text being +-- parsed and the byte count where the problem was discovered. You can +-- replace the default JSON:onDecodeError() with your own function. +-- +-- The default onDecodeError() merely augments the message with data +-- about the text and the location if known (and if a second 'etc' +-- argument had been provided to decode(), its value is tacked onto the +-- message as well), and then calls JSON.assert(), which itself defaults +-- to Lua's built-in assert(), and can also be overridden. 
+-- +-- For example, in an Adobe Lightroom plugin, you might use something like +-- +-- function JSON:onDecodeError(message, text, location, etc) +-- LrErrors.throwUserError("Internal Error: invalid JSON data") +-- end +-- +-- or even just +-- +-- function JSON.assert(condition, message) +-- LrErrors.throwUserError("Internal Error: " .. message) +-- end +-- +-- If JSON:decode() is passed a nil, this is called instead: +-- +-- JSON:onDecodeOfNilError(message, nil, nil, etc) +-- +-- and if JSON:decode() is passed HTML instead of JSON, this is called: +-- +-- JSON:onDecodeOfHTMLError(message, text, nil, etc) +-- +-- The use of the fourth 'etc' argument allows stronger coordination +-- between decoding and error reporting, especially when you provide your +-- own error-handling routines. Continuing with the Adobe Lightroom +-- plugin example: +-- +-- function JSON:onDecodeError(message, text, location, etc) +-- local note = "Internal Error: invalid JSON data" +-- if type(etc) == 'table' and etc.photo then +-- note = note .. " while processing for " .. etc.photo:getFormattedMetadata('fileName') +-- end +-- LrErrors.throwUserError(note) +-- end +-- +-- : +-- : +-- +-- for i, photo in ipairs(photosToProcess) do +-- : +-- : +-- local data = JSON:decode(someJsonText, { photo = photo }) +-- : +-- : +-- end +-- +-- +-- +-- +-- +-- DECODING AND STRICT TYPES +-- +-- Because both JSON objects and JSON arrays are converted to Lua tables, +-- it's not normally possible to tell which original JSON type a +-- particular Lua table was derived from, or guarantee decode-encode +-- round-trip equivalency. +-- +-- However, if you enable strictTypes, e.g. +-- +-- JSON = assert(loadfile "JSON.lua")() --load the routines +-- JSON.strictTypes = true +-- +-- then the Lua table resulting from the decoding of a JSON object or +-- JSON array is marked via Lua metatable, so that when re-encoded with +-- JSON:encode() it ends up as the appropriate JSON type. 
+-- +-- (This is not the default because other routines may not work well with +-- tables that have a metatable set, for example, Lightroom API calls.) +-- +-- +-- ENCODING (from a lua table to a JSON string) +-- +-- JSON = assert(loadfile "JSON.lua")() -- one-time load of the routines +-- +-- local raw_json_text = JSON:encode(lua_table_or_value) +-- local pretty_json_text = JSON:encode_pretty(lua_table_or_value) -- "pretty printed" version for human readability +-- local custom_pretty = JSON:encode(lua_table_or_value, etc, { pretty = true, indent = "| ", align_keys = false }) +-- +-- On error during encoding, this code calls: +-- +-- JSON:onEncodeError(message, etc) +-- +-- which you can override in your local JSON object. +-- +-- The 'etc' in the error call is the second argument to encode() +-- and encode_pretty(), or nil if it wasn't provided. +-- +-- +-- PRETTY-PRINTING +-- +-- An optional third argument, a table of options, allows a bit of +-- configuration about how the encoding takes place: +-- +-- pretty = JSON:encode(val, etc, { +-- pretty = true, -- if false, no other options matter +-- indent = " ", -- this provides for a three-space indent per nesting level +-- align_keys = false, -- see below +-- }) +-- +-- encode() and encode_pretty() are identical except that encode_pretty() +-- provides a default options table if none given in the call: +-- +-- { pretty = true, align_keys = false, indent = " " } +-- +-- For example, if +-- +-- JSON:encode(data) +-- +-- produces: +-- +-- {"city":"Kyoto","climate":{"avg_temp":16,"humidity":"high","snowfall":"minimal"},"country":"Japan","wards":11} +-- +-- then +-- +-- JSON:encode_pretty(data) +-- +-- produces: +-- +-- { +-- "city": "Kyoto", +-- "climate": { +-- "avg_temp": 16, +-- "humidity": "high", +-- "snowfall": "minimal" +-- }, +-- "country": "Japan", +-- "wards": 11 +-- } +-- +-- The following three lines return identical results: +-- JSON:encode_pretty(data) +-- JSON:encode_pretty(data, nil, { pretty = true, 
align_keys = false, indent = " " }) +-- JSON:encode (data, nil, { pretty = true, align_keys = false, indent = " " }) +-- +-- An example of setting your own indent string: +-- +-- JSON:encode_pretty(data, nil, { pretty = true, indent = "| " }) +-- +-- produces: +-- +-- { +-- | "city": "Kyoto", +-- | "climate": { +-- | | "avg_temp": 16, +-- | | "humidity": "high", +-- | | "snowfall": "minimal" +-- | }, +-- | "country": "Japan", +-- | "wards": 11 +-- } +-- +-- An example of setting align_keys to true: +-- +-- JSON:encode_pretty(data, nil, { pretty = true, indent = " ", align_keys = true }) +-- +-- produces: +-- +-- { +-- "city": "Kyoto", +-- "climate": { +-- "avg_temp": 16, +-- "humidity": "high", +-- "snowfall": "minimal" +-- }, +-- "country": "Japan", +-- "wards": 11 +-- } +-- +-- which I must admit is kinda ugly, sorry. This was the default for +-- encode_pretty() prior to version 20141223.14. +-- +-- +-- AMBIGUOUS SITUATIONS DURING THE ENCODING +-- +-- During the encode, if a Lua table being encoded contains both string +-- and numeric keys, it fits neither JSON's idea of an object, nor its +-- idea of an array. To get around this, when any string key exists (or +-- when non-positive numeric keys exist), numeric keys are converted to +-- strings. +-- +-- For example, +-- JSON:encode({ "one", "two", "three", SOMESTRING = "some string" })) +-- produces the JSON object +-- {"1":"one","2":"two","3":"three","SOMESTRING":"some string"} +-- +-- To prohibit this conversion and instead make it an error condition, set +-- JSON.noKeyConversion = true +-- + + + + +-- +-- SUMMARY OF METHODS YOU CAN OVERRIDE IN YOUR LOCAL LUA JSON OBJECT +-- +-- assert +-- onDecodeError +-- onDecodeOfNilError +-- onDecodeOfHTMLError +-- onEncodeError +-- +-- If you want to create a separate Lua JSON object with its own error handlers, +-- you can reload JSON.lua or use the :new() method. 
+-- +--------------------------------------------------------------------------- + +local default_pretty_indent = " " +local default_pretty_options = { pretty = true, align_keys = false, indent = default_pretty_indent } + +local isArray = { __tostring = function() return "JSON array" end } isArray.__index = isArray +local isObject = { __tostring = function() return "JSON object" end } isObject.__index = isObject + + +function OBJDEF:newArray(tbl) + return setmetatable(tbl or {}, isArray) +end + +function OBJDEF:newObject(tbl) + return setmetatable(tbl or {}, isObject) +end + +local function unicode_codepoint_as_utf8(codepoint) + -- + -- codepoint is a number + -- + if codepoint <= 127 then + return string.char(codepoint) + + elseif codepoint <= 2047 then + -- + -- 110yyyxx 10xxxxxx <-- useful notation from http://en.wikipedia.org/wiki/Utf8 + -- + local highpart = math.floor(codepoint / 0x40) + local lowpart = codepoint - (0x40 * highpart) + return string.char(0xC0 + highpart, + 0x80 + lowpart) + + elseif codepoint <= 65535 then + -- + -- 1110yyyy 10yyyyxx 10xxxxxx + -- + local highpart = math.floor(codepoint / 0x1000) + local remainder = codepoint - 0x1000 * highpart + local midpart = math.floor(remainder / 0x40) + local lowpart = remainder - 0x40 * midpart + + highpart = 0xE0 + highpart + midpart = 0x80 + midpart + lowpart = 0x80 + lowpart + + -- + -- Check for an invalid character (thanks Andy R. at Adobe). + -- See table 3.7, page 93, in http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#G28070 + -- + if ( highpart == 0xE0 and midpart < 0xA0 ) or + ( highpart == 0xED and midpart > 0x9F ) or + ( highpart == 0xF0 and midpart < 0x90 ) or + ( highpart == 0xF4 and midpart > 0x8F ) + then + return "?" 
+ else + return string.char(highpart, + midpart, + lowpart) + end + + else + -- + -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx + -- + local highpart = math.floor(codepoint / 0x40000) + local remainder = codepoint - 0x40000 * highpart + local midA = math.floor(remainder / 0x1000) + remainder = remainder - 0x1000 * midA + local midB = math.floor(remainder / 0x40) + local lowpart = remainder - 0x40 * midB + + return string.char(0xF0 + highpart, + 0x80 + midA, + 0x80 + midB, + 0x80 + lowpart) + end +end + +function OBJDEF:onDecodeError(message, text, location, etc) + if text then + if location then + message = string.format("%s at char %d of: %s", message, location, text) + else + message = string.format("%s: %s", message, text) + end + end + + if etc ~= nil then + message = message .. " (" .. OBJDEF:encode(etc) .. ")" + end + + if self.assert then + self.assert(false, message) + else + assert(false, message) + end +end + +OBJDEF.onDecodeOfNilError = OBJDEF.onDecodeError +OBJDEF.onDecodeOfHTMLError = OBJDEF.onDecodeError + +function OBJDEF:onEncodeError(message, etc) + if etc ~= nil then + message = message .. " (" .. OBJDEF:encode(etc) .. ")" + end + + if self.assert then + self.assert(false, message) + else + assert(false, message) + end +end + +local function grok_number(self, text, start, etc) + -- + -- Grab the integer part + -- + local integer_part = text:match('^-?[1-9]%d*', start) + or text:match("^-?0", start) + + if not integer_part then + self:onDecodeError("expected number", text, start, etc) + end + + local i = start + integer_part:len() + + -- + -- Grab an optional decimal part + -- + local decimal_part = text:match('^%.%d+', i) or "" + + i = i + decimal_part:len() + + -- + -- Grab an optional exponential part + -- + local exponent_part = text:match('^[eE][-+]?%d+', i) or "" + + i = i + exponent_part:len() + + local full_number_text = integer_part .. decimal_part .. 
exponent_part + local as_number = tonumber(full_number_text) + + if not as_number then + self:onDecodeError("bad number", text, start, etc) + end + + return as_number, i +end + + +local function grok_string(self, text, start, etc) + + if text:sub(start,start) ~= '"' then + self:onDecodeError("expected string's opening quote", text, start, etc) + end + + local i = start + 1 -- +1 to bypass the initial quote + local text_len = text:len() + local VALUE = "" + while i <= text_len do + local c = text:sub(i,i) + if c == '"' then + return VALUE, i + 1 + end + if c ~= '\\' then + VALUE = VALUE .. c + i = i + 1 + elseif text:match('^\\b', i) then + VALUE = VALUE .. "\b" + i = i + 2 + elseif text:match('^\\f', i) then + VALUE = VALUE .. "\f" + i = i + 2 + elseif text:match('^\\n', i) then + VALUE = VALUE .. "\n" + i = i + 2 + elseif text:match('^\\r', i) then + VALUE = VALUE .. "\r" + i = i + 2 + elseif text:match('^\\t', i) then + VALUE = VALUE .. "\t" + i = i + 2 + else + local hex = text:match('^\\u([0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF])', i) + if hex then + i = i + 6 -- bypass what we just read + + -- We have a Unicode codepoint. It could be standalone, or if in the proper range and + -- followed by another in a specific range, it'll be a two-code surrogate pair. + local codepoint = tonumber(hex, 16) + if codepoint >= 0xD800 and codepoint <= 0xDBFF then + -- it's a hi surrogate... see whether we have a following low + local lo_surrogate = text:match('^\\u([dD][cdefCDEF][0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF])', i) + if lo_surrogate then + i = i + 6 -- bypass the low surrogate we just read + codepoint = 0x2400 + (codepoint - 0xD800) * 0x400 + tonumber(lo_surrogate, 16) + else + -- not a proper low, so we'll just leave the first codepoint as is and spit it out. + end + end + VALUE = VALUE .. unicode_codepoint_as_utf8(codepoint) + + else + + -- just pass through what's escaped + VALUE = VALUE .. 
text:match('^\\(.)', i) + i = i + 2 + end + end + end + + self:onDecodeError("unclosed string", text, start, etc) +end + +local function skip_whitespace(text, start) + + local _, match_end = text:find("^[ \n\r\t]+", start) -- [http://www.ietf.org/rfc/rfc4627.txt] Section 2 + if match_end then + return match_end + 1 + else + return start + end +end + +local grok_one -- assigned later + +local function grok_object(self, text, start, etc) + if text:sub(start,start) ~= '{' then + self:onDecodeError("expected '{'", text, start, etc) + end + + local i = skip_whitespace(text, start + 1) -- +1 to skip the '{' + + local VALUE = self.strictTypes and self:newObject { } or { } + + if text:sub(i,i) == '}' then + return VALUE, i + 1 + end + local text_len = text:len() + while i <= text_len do + local key, new_i = grok_string(self, text, i, etc) + + i = skip_whitespace(text, new_i) + + if text:sub(i, i) ~= ':' then + self:onDecodeError("expected colon", text, i, etc) + end + + i = skip_whitespace(text, i + 1) + + local new_val, new_i = grok_one(self, text, i) + + VALUE[key] = new_val + + -- + -- Expect now either '}' to end things, or a ',' to allow us to continue. 
+ -- + i = skip_whitespace(text, new_i) + + local c = text:sub(i,i) + + if c == '}' then + return VALUE, i + 1 + end + + if text:sub(i, i) ~= ',' then + self:onDecodeError("expected comma or '}'", text, i, etc) + end + + i = skip_whitespace(text, i + 1) + end + + self:onDecodeError("unclosed '{'", text, start, etc) +end + +local function grok_array(self, text, start, etc) + if text:sub(start,start) ~= '[' then + self:onDecodeError("expected '['", text, start, etc) + end + + local i = skip_whitespace(text, start + 1) -- +1 to skip the '[' + local VALUE = self.strictTypes and self:newArray { } or { } + if text:sub(i,i) == ']' then + return VALUE, i + 1 + end + + local VALUE_INDEX = 1 + + local text_len = text:len() + while i <= text_len do + local val, new_i = grok_one(self, text, i) + + -- can't table.insert(VALUE, val) here because it's a no-op if val is nil + VALUE[VALUE_INDEX] = val + VALUE_INDEX = VALUE_INDEX + 1 + + i = skip_whitespace(text, new_i) + + -- + -- Expect now either ']' to end things, or a ',' to allow us to continue. 
+ -- + local c = text:sub(i,i) + if c == ']' then + return VALUE, i + 1 + end + if text:sub(i, i) ~= ',' then + self:onDecodeError("expected comma or '['", text, i, etc) + end + i = skip_whitespace(text, i + 1) + end + self:onDecodeError("unclosed '['", text, start, etc) +end + + +grok_one = function(self, text, start, etc) + -- Skip any whitespace + start = skip_whitespace(text, start) + + if start > text:len() then + self:onDecodeError("unexpected end of string", text, nil, etc) + end + + if text:find('^"', start) then + return grok_string(self, text, start, etc) + + elseif text:find('^[-0123456789 ]', start) then + return grok_number(self, text, start, etc) + + elseif text:find('^%{', start) then + return grok_object(self, text, start, etc) + + elseif text:find('^%[', start) then + return grok_array(self, text, start, etc) + + elseif text:find('^true', start) then + return true, start + 4 + + elseif text:find('^false', start) then + return false, start + 5 + + elseif text:find('^null', start) then + return nil, start + 4 + + else + self:onDecodeError("can't parse JSON", text, start, etc) + end +end + +function OBJDEF:decode(text, etc) + if type(self) ~= 'table' or self.__index ~= OBJDEF then + OBJDEF:onDecodeError("JSON:decode must be called in method format", nil, nil, etc) + end + + if text == nil then + self:onDecodeOfNilError(string.format("nil passed to JSON:decode()"), nil, nil, etc) + elseif type(text) ~= 'string' then + self:onDecodeError(string.format("expected string argument to JSON:decode(), got %s", type(text)), nil, nil, etc) + end + + if text:match('^%s*$') then + return nil + end + + if text:match('^%s*<') then + -- Can't be JSON... we'll assume it's HTML + self:onDecodeOfHTMLError(string.format("html passed to JSON:decode()"), text, nil, etc) + end + + -- + -- Ensure that it's not UTF-32 or UTF-16. + -- Those are perfectly valid encodings for JSON (as per RFC 4627 section 3), + -- but this package can't handle them. 
+ -- + if text:sub(1,1):byte() == 0 or (text:len() >= 2 and text:sub(2,2):byte() == 0) then + self:onDecodeError("JSON package groks only UTF-8, sorry", text, nil, etc) + end + + local success, value = pcall(grok_one, self, text, 1, etc) + + if success then + return value + else + -- if JSON:onDecodeError() didn't abort out of the pcall, we'll have received the error message here as "value", so pass it along as an assert. + if self.assert then + self.assert(false, value) + else + assert(false, value) + end + -- and if we're still here, return a nil and throw the error message on as a second arg + return nil, value + end +end + +local function backslash_replacement_function(c) + if c == "\n" then + return "\\n" + elseif c == "\r" then + return "\\r" + elseif c == "\t" then + return "\\t" + elseif c == "\b" then + return "\\b" + elseif c == "\f" then + return "\\f" + elseif c == '"' then + return '\\"' + elseif c == '\\' then + return '\\\\' + else + return string.format("\\u%04x", c:byte()) + end +end + +local chars_to_be_escaped_in_JSON_string + = '[' + .. '"' -- class sub-pattern to match a double quote + .. '%\\' -- class sub-pattern to match a backslash + .. '%z' -- class sub-pattern to match a null + .. '\001' .. '-' .. '\031' -- class sub-pattern to match control characters + .. ']' + +local function json_string_literal(value) + local newval = value:gsub(chars_to_be_escaped_in_JSON_string, backslash_replacement_function) + return '"' .. newval .. '"' +end + +local function object_or_array(self, T, etc) + -- + -- We need to inspect all the keys... if there are any strings, we'll convert to a JSON + -- object. If there are only numbers, it's a JSON array. + -- + -- If we'll be converting to a JSON object, we'll want to sort the keys so that the + -- end result is deterministic. 
+ -- + local string_keys = { } + local number_keys = { } + local number_keys_must_be_strings = false + local maximum_number_key + + for key in pairs(T) do + if type(key) == 'string' then + table.insert(string_keys, key) + elseif type(key) == 'number' then + table.insert(number_keys, key) + if key <= 0 or key >= math.huge then + number_keys_must_be_strings = true + elseif not maximum_number_key or key > maximum_number_key then + maximum_number_key = key + end + else + self:onEncodeError("can't encode table with a key of type " .. type(key), etc) + end + end + + if #string_keys == 0 and not number_keys_must_be_strings then + -- + -- An empty table, or a numeric-only array + -- + if #number_keys > 0 then + return nil, maximum_number_key -- an array + elseif tostring(T) == "JSON array" then + return nil + elseif tostring(T) == "JSON object" then + return { } + else + -- have to guess, so we'll pick array, since empty arrays are likely more common than empty objects + return nil + end + end + + table.sort(string_keys) + + local map + if #number_keys > 0 then + -- + -- If we're here then we have either mixed string/number keys, or numbers inappropriate for a JSON array + -- It's not ideal, but we'll turn the numbers into strings so that we can at least create a JSON object. 
+ -- + + if self.noKeyConversion then + self:onEncodeError("a table with both numeric and string keys could be an object or array; aborting", etc) + end + + -- + -- Have to make a shallow copy of the source table so we can remap the numeric keys to be strings + -- + map = { } + for key, val in pairs(T) do + map[key] = val + end + + table.sort(number_keys) + + -- + -- Throw numeric keys in there as strings + -- + for _, number_key in ipairs(number_keys) do + local string_key = tostring(number_key) + if map[string_key] == nil then + table.insert(string_keys , string_key) + map[string_key] = T[number_key] + else + self:onEncodeError("conflict converting table with mixed-type keys into a JSON object: key " .. number_key .. " exists both as a string and a number.", etc) + end + end + end + + return string_keys, nil, map +end + +-- +-- Encode +-- +-- 'options' is nil, or a table with possible keys: +-- pretty -- if true, return a pretty-printed version +-- indent -- a string (usually of spaces) used to indent each nested level +-- align_keys -- if true, align all the keys when formatting a table +-- +local encode_value -- must predeclare because it calls itself +function encode_value(self, value, parents, etc, options, indent) + + if value == nil then + return 'null' + + elseif type(value) == 'string' then + return json_string_literal(value) + + elseif type(value) == 'number' then + if value ~= value then + -- + -- NaN (Not a Number). + -- JSON has no NaN, so we have to fudge the best we can. This should really be a package option. + -- + return "null" + elseif value >= math.huge then + -- + -- Positive infinity. JSON has no INF, so we have to fudge the best we can. This should + -- really be a package option. Note: at least with some implementations, positive infinity + -- is both ">= math.huge" and "<= -math.huge", which makes no sense but that's how it is. + -- Negative infinity is properly "<= -math.huge". So, we must be sure to check the ">=" + -- case first. 
+ -- + return "1e+9999" + elseif value <= -math.huge then + -- + -- Negative infinity. + -- JSON has no INF, so we have to fudge the best we can. This should really be a package option. + -- + return "-1e+9999" + else + return tostring(value) + end + + elseif type(value) == 'boolean' then + return tostring(value) + + elseif type(value) ~= 'table' then + self:onEncodeError("can't convert " .. type(value) .. " to JSON", etc) + + else + -- + -- A table to be converted to either a JSON object or array. + -- + local T = value + + if type(options) ~= 'table' then + options = {} + end + if type(indent) ~= 'string' then + indent = "" + end + + if parents[T] then + self:onEncodeError("table " .. tostring(T) .. " is a child of itself", etc) + else + parents[T] = true + end + + local result_value + + local object_keys, maximum_number_key, map = object_or_array(self, T, etc) + if maximum_number_key then + -- + -- An array... + -- + local ITEMS = { } + for i = 1, maximum_number_key do + table.insert(ITEMS, encode_value(self, T[i], parents, etc, options, indent)) + end + + if options.pretty then + result_value = "[ " .. table.concat(ITEMS, ", ") .. " ]" + else + result_value = "[" .. table.concat(ITEMS, ",") .. "]" + end + + elseif object_keys then + -- + -- An object + -- + local TT = map or T + + if options.pretty then + + local KEYS = { } + local max_key_length = 0 + for _, key in ipairs(object_keys) do + local encoded = encode_value(self, tostring(key), parents, etc, options, indent) + if options.align_keys then + max_key_length = math.max(max_key_length, #encoded) + end + table.insert(KEYS, encoded) + end + local key_indent = indent .. tostring(options.indent or "") + local subtable_indent = key_indent .. string.rep(" ", max_key_length) .. (options.align_keys and " " or "") + local FORMAT = "%s%" .. string.format("%d", max_key_length) .. 
"s: %s" + + local COMBINED_PARTS = { } + for i, key in ipairs(object_keys) do + local encoded_val = encode_value(self, TT[key], parents, etc, options, subtable_indent) + table.insert(COMBINED_PARTS, string.format(FORMAT, key_indent, KEYS[i], encoded_val)) + end + result_value = "{\n" .. table.concat(COMBINED_PARTS, ",\n") .. "\n" .. indent .. "}" + + else + + local PARTS = { } + for _, key in ipairs(object_keys) do + local encoded_val = encode_value(self, TT[key], parents, etc, options, indent) + local encoded_key = encode_value(self, tostring(key), parents, etc, options, indent) + table.insert(PARTS, string.format("%s:%s", encoded_key, encoded_val)) + end + result_value = "{" .. table.concat(PARTS, ",") .. "}" + + end + else + -- + -- An empty array/object... we'll treat it as an array, though it should really be an option + -- + result_value = "[]" + end + + parents[T] = false + return result_value + end +end + + +function OBJDEF:encode(value, etc, options) + if type(self) ~= 'table' or self.__index ~= OBJDEF then + OBJDEF:onEncodeError("JSON:encode must be called in method format", etc) + end + return encode_value(self, value, {}, etc, options or nil) +end + +function OBJDEF:encode_pretty(value, etc, options) + if type(self) ~= 'table' or self.__index ~= OBJDEF then + OBJDEF:onEncodeError("JSON:encode_pretty must be called in method format", etc) + end + return encode_value(self, value, {}, etc, options or default_pretty_options) +end + +function OBJDEF.__tostring() + return "JSON encode/decode package" +end + +OBJDEF.__index = OBJDEF + +function OBJDEF:new(args) + local new = { } + + if args then + for key, val in pairs(args) do + new[key] = val + end + end + + return setmetatable(new, OBJDEF) +end + +return OBJDEF:new() + +-- +-- Version history: +-- +-- 20141223.14 The encode_pretty() routine produced fine results for small datasets, but isn't really +-- appropriate for anything large, so with help from Alex Aulbach I've made the encode routines +-- more 
flexible, and changed the default encode_pretty() to be more generally useful. +-- +-- Added a third 'options' argument to the encode() and encode_pretty() routines, to control +-- how the encoding takes place. +-- +-- Updated docs to add assert() call to the loadfile() line, just as good practice so that +-- if there is a problem loading JSON.lua, the appropriate error message will percolate up. +-- +-- 20140920.13 Put back (in a way that doesn't cause warnings about unused variables) the author string, +-- so that the source of the package, and its version number, are visible in compiled copies. +-- +-- 20140911.12 Minor lua cleanup. +-- Fixed internal reference to 'JSON.noKeyConversion' to reference 'self' instead of 'JSON'. +-- (Thanks to SmugMug's David Parry for these.) +-- +-- 20140418.11 JSON nulls embedded within an array were being ignored, such that +-- ["1",null,null,null,null,null,"seven"], +-- would return +-- {1,"seven"} +-- It's now fixed to properly return +-- {1, nil, nil, nil, nil, nil, "seven"} +-- Thanks to "haddock" for catching the error. +-- +-- 20140116.10 The user's JSON.assert() wasn't always being used. Thanks to "blue" for the heads up. +-- +-- 20131118.9 Update for Lua 5.3... it seems that tostring(2/1) produces "2.0" instead of "2", +-- and this caused some problems. +-- +-- 20131031.8 Unified the code for encode() and encode_pretty(); they had been stupidly separate, +-- and had of course diverged (encode_pretty didn't get the fixes that encode got, so +-- sometimes produced incorrect results; thanks to Mattie for the heads up). +-- +-- Handle encoding tables with non-positive numeric keys (unlikely, but possible). +-- +-- If a table has both numeric and string keys, or its numeric keys are inappropriate +-- (such as being non-positive or infinite), the numeric keys are turned into +-- string keys appropriate for a JSON object. 
So, as before, +-- JSON:encode({ "one", "two", "three" }) +-- produces the array +-- ["one","two","three"] +-- but now something with mixed key types like +-- JSON:encode({ "one", "two", "three", SOMESTRING = "some string" }) +-- instead of throwing an error produces an object: +-- {"1":"one","2":"two","3":"three","SOMESTRING":"some string"} +-- +-- To maintain the prior throw-an-error semantics, set +-- JSON.noKeyConversion = true +-- +-- 20131004.7 Release under a Creative Commons CC-BY license, which I should have done from day one, sorry. +-- +-- 20130120.6 Comment update: added a link to the specific page on my blog where this code can +-- be found, so that folks who come across the code outside of my blog can find updates +-- more easily. +-- +-- 20111207.5 Added support for the 'etc' arguments, for better error reporting. +-- +-- 20110731.4 More feedback from David Kolf on how to make the tests for NaN/Infinity system independent. +-- +-- 20110730.3 Incorporated feedback from David Kolf at http://lua-users.org/wiki/JsonModules: +-- +-- * When encoding lua for JSON, Sparse numeric arrays are now handled by +-- spitting out full arrays, such that +-- JSON:encode({"one", "two", [10] = "ten"}) +-- returns +-- ["one","two",null,null,null,null,null,null,null,"ten"] +-- +-- In 20100810.2 and earlier, only up to the first non-null value would have been retained. +-- +-- * When encoding lua for JSON, numeric value NaN gets spit out as null, and infinity as "1e+9999". +-- Version 20100810.2 and earlier created invalid JSON in both cases. +-- +-- * Unicode surrogate pairs are now detected when decoding JSON. +-- +-- 20100810.2 added some checking to ensure that an invalid Unicode character couldn't leak in to the UTF-8 encoding +-- +-- 20100731.1 initial public release +-- diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cf1ab25 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. 
+ +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <https://unlicense.org> diff --git a/README.md b/README.md new file mode 100644 index 0000000..ad7dc4f --- /dev/null +++ b/README.md @@ -0,0 +1,184 @@ +urls-grab +============= + +More information about the archiving project can be found on the ArchiveTeam wiki: [URLs](http://archiveteam.org/index.php?title=URLs) + +Setup instructions +========================= + +Be sure to replace `YOURNICKHERE` with the nickname that you want to be shown as, on the tracker. You don't need to register it, just pick a nickname you like. + +In most of the below cases, there will be a web interface running at http://localhost:8001/. If you don't know or care what this is, you can just ignore it—otherwise, it gives you a fancy view of what's going on. + +**If anything goes wrong while running the commands below, please scroll down to the bottom of this page.
There's troubleshooting information there.** + +Running with a warrior +------------------------- + +Follow the [instructions on the ArchiveTeam wiki](http://archiveteam.org/index.php?title=Warrior) for installing the Warrior, and select the "URLs" project in the Warrior interface. + +Running without a warrior +------------------------- +To run this outside the warrior, clone this repository, cd into its directory and run: + + python3 -m pip install setuptools wheel + python3 -m pip install --upgrade seesaw zstandard requests + ./get-wget-lua.sh + +then start downloading with: + + run-pipeline3 pipeline.py --concurrent 2 YOURNICKHERE + +For more options, run: + + run-pipeline3 --help + +If you don't have root access and/or your version of pip is very old, you can replace "pip install --upgrade seesaw" with: + + wget https://raw.github.com/pypa/pip/master/contrib/get-pip.py ; python3 get-pip.py --user ; ~/.local/bin/pip3 install --upgrade --user seesaw + +so that pip and seesaw are installed in your home, then run + + ~/.local/bin/run-pipeline3 pipeline.py --concurrent 2 YOURNICKHERE + +Running multiple instances on different IPs +------------------------------------------- + +This feature requires seesaw version 0.0.16 or greater. Use `pip install --upgrade seesaw` to upgrade. + +Use the `--context-value` argument to pass in `bind_address=123.4.5.6` (replace the IP address with your own). + +Example of running 2 threads, no web interface, and Wget binding of IP address: + + run-pipeline3 pipeline.py --concurrent 2 YOURNICKHERE --disable-web-server --context-value bind_address=123.4.5.6 + +Distribution-specific setup +------------------------- +### For Debian/Ubuntu: + +Package `libzstd-dev` version 1.4.4 is required which is currently available from `buster-backports`. 
+ + adduser --system --group --shell /bin/bash archiveteam + echo deb http://deb.debian.org/debian buster-backports main contrib > /etc/apt/sources.list.d/backports.list + apt-get update \ + && apt-get install -y git-core libgnutls-dev lua5.1 liblua5.1-0 liblua5.1-0-dev screen bzip2 zlib1g-dev flex autoconf autopoint texinfo gperf lua-socket rsync automake pkg-config python3-dev python3-pip build-essential \ + && apt-get -t buster-backports install zstd libzstd-dev libzstd1 + python3 -m pip install setuptools wheel + python3 -m pip install --upgrade seesaw zstandard requests + su -c "cd /home/archiveteam; git clone https://github.com/ArchiveTeam/urls-grab.git; cd urls-grab; ./get-wget-lua.sh" archiveteam + screen su -c "cd /home/archiveteam/urls-grab/; run-pipeline3 pipeline.py --concurrent 2 --address '127.0.0.1' YOURNICKHERE" archiveteam + [... ctrl+A D to detach ...] + +In __Debian Jessie, Ubuntu 18.04 Bionic and above__, the `libgnutls-dev` package was renamed to `libgnutls28-dev`. So, you need to do the following instead: + + adduser --system --group --shell /bin/bash archiveteam + echo deb http://deb.debian.org/debian buster-backports main contrib > /etc/apt/sources.list.d/backports.list + apt-get update \ + && apt-get install -y git-core libgnutls28-dev lua5.1 liblua5.1-0 liblua5.1-0-dev screen bzip2 zlib1g-dev flex autoconf autopoint texinfo gperf lua-socket rsync automake pkg-config python3-dev python3-pip build-essential \ + && apt-get -t buster-backports install zstd libzstd-dev libzstd1 + [... pretty much the same as above ...] + +Wget-lua is also available on [ArchiveTeam's PPA](https://launchpad.net/~archiveteam/+archive/wget-lua) for Ubuntu. + +### For CentOS: + +Ensure that you have the CentOS equivalent of bzip2 installed as well. You will need the EPEL repository to be enabled. 
+ + yum -y groupinstall "Development Tools" + yum -y install gnutls-devel lua-devel python-pip zlib-devel zstd libzstd-devel git-core gperf lua-socket luarocks texinfo git rsync gettext-devel + pip install --upgrade seesaw + [... pretty much the same as above ...] + +Tested with EL7 repositories. + +### For Fedora: + +The same as CentOS but with "dnf" instead of "yum". Compiling has not been successfully tested so far. + +### For openSUSE: + + zypper install liblua5_1 lua51 lua51-devel screen python-pip libgnutls-devel bzip2 python-devel gcc make + pip install --upgrade seesaw + [... pretty much the same as above ...] + +### For OS X: + +You need Homebrew. Ensure that you have the OS X equivalent of bzip2 installed as well. + + brew install python lua gnutls + pip install --upgrade seesaw + [... pretty much the same as above ...] + +**There is a known issue with some packaged versions of rsync. If you get errors during the upload stage, urls-grab will not work with your rsync version.** + +This supposedly fixes it: + + alias rsync=/usr/local/bin/rsync + +### For Arch Linux: + +Ensure that you have the Arch equivalent of bzip2 installed as well. + +1. Make sure you have `python2-pip` installed. +2. Install [the wget-lua package from the AUR](https://aur.archlinux.org/packages/wget-lua/). +3. Run `pip2 install --upgrade seesaw`. +4. Modify the run-pipeline script in seesaw to point at `#!/usr/bin/python2` instead of `#!/usr/bin/python`. +5. `useradd --system --group users --shell /bin/bash --create-home archiveteam` +6.
`screen su -c "cd /home/archiveteam/urls-grab/; run-pipeline pipeline.py --concurrent 2 --address '127.0.0.1' YOURNICKHERE" archiveteam` + +### For Alpine Linux: + + apk add lua5.1 git python bzip2 bash rsync gcc libc-dev lua5.1-dev zlib-dev gnutls-dev autoconf flex make + python -m ensurepip + pip install -U seesaw + git clone https://github.com/ArchiveTeam/urls-grab + cd urls-grab; ./get-wget-lua.sh + run-pipeline pipeline.py --concurrent 2 --address '127.0.0.1' YOURNICKHERE + +### For FreeBSD: + +Honestly, I have no idea. `./get-wget-lua.sh` supposedly doesn't work due to differences in the `tar` that ships with FreeBSD. Another problem is the apparent absence of Lua 5.1 development headers. If you figure this out, please do let us know on IRC (irc.hackint.org #//). + +Troubleshooting +========================= + +Broken? These are some of the possible solutions: + +### wget-lua was not successfully built + +If you get errors about `wget.pod` or something similar, the documentation failed to compile - wget-lua, however, compiled fine. Try this: + + cd get-wget-lua.tmp/wget-lua + mv src/wget ../../wget-at + cd ../.. + +The `get-wget-lua.tmp` name may be inaccurate. If you have a folder with a similar but different name, use that instead and please let us know on IRC what folder name you had! + +Optionally, if you know what you're doing, you may want to use wgetpod.patch. + +### Problem with gnutls or openssl during get-wget-lua + +Please ensure that gnutls-dev(el) and openssl-dev(el) are installed. + +### ImportError: No module named seesaw + +If you're sure that you followed the steps to install `seesaw`, permissions on your module directory may be set incorrectly. Try the following: + + chmod o+rX -R /usr/local/lib/python2.7/dist-packages + +### run-pipeline: command not found + +Install `seesaw` using `pip2` instead of `pip`. + + pip2 install seesaw + +### Issues in the code + +If you notice a bug and want to file a bug report, please use the GitHub issues tracker.
+ +Are you a developer? Help write code for us! Look at our [developer documentation](http://archiveteam.org/index.php?title=Dev) for details. + +### Other problems + +Have an issue not listed here? Join us on IRC and ask! We can be found at hackint IRC [#//](https://webirc.hackint.org/#irc://irc.hackint.org/#//). + diff --git a/bad-params.txt b/bad-params.txt new file mode 100644 index 0000000..24dcbb0 --- /dev/null +++ b/bad-params.txt @@ -0,0 +1,64 @@ +utm_source +utm_medium +utm_campaign +utm_term +utm_content +utm_adgroup +ref +refsrc +referrer_id +referrerid +src +i +s +ts +feature +jsessionid +phpsessid +aspsessionid +sessionid +zenid +sid +gclid +fb_xd_fragment +fb_comment_id +fbclid +cfid +cftoken +doing_wp_cron +pk_cpn +pk_campaign +pk_kwd +pk_keyword +piwik_campaign +piwik_kwd +ga_source +ga_medium +ga_term +ga_content +ga_campaign +ga_place +yclid +_openstat +fb_action_ids +fb_action_types +fb_source +fb_ref +action_object_map +action_type_map +action_ref_map +gs_l +mkt_tok +hmb_campaign +hmb_medium +hmb_source +rand +wicket:antiCache +cachebuster +nocache +vs +dilid +script_case_session +cid +extid +_flowexecutionkey diff --git a/bad-patterns.txt b/bad-patterns.txt new file mode 100644 index 0000000..9427158 --- /dev/null +++ b/bad-patterns.txt @@ -0,0 +1,33 @@ +/action/consumeSharedSessionAction +/action/consumeSsoCookie +/action/getSharedSiteSession +/juris/error%.jsf +facebook%.com/login%.php +facebook%.com/cookie/ +facebook%.com/plugins/ +facebook%.com/sharer/ +facebook%.com/sharer%.php +gongquiz%.com.+&historyNo=[0-9]+ +univis%.univie%.ac%.at/ausschreibungstellensuche/ +fundraise%.cancerresearchuk%.org/signup/account/ +mma%.ft%.com +^https?://dmg%.go%-2b%-planer%.de/ +^https?://3d%.espace%-aubade%.fr/ +^https?://kuechenplaner%.[^/]+/cloud/ +^https?://3d%-salledebains%.geberit%.fr/ +^https?://bibliotekanauki%.ceon%.pl/yadda/search/general%.action +^https?://[^/]+%.icm%.edu%.pl/.*search/article%.action +^https?://interamt%.de/koop/app/ 
+^https?://tesiunam%.dgb%.unam%.mx/F/ +^https?://[^%.]+%.sedelectronica%.es/.*%?x= +^https?://www%.cp%-cc%.org/programs%-services/ +/ibank/_crypt_ +%%7B%%7B.+%%7D%%7D +^https?://[^/]+/" +^http://[0-9a-z][0-9a-z][0-9a-z][0-9][0-9][0-9]?%.[^%./]+%.com/$ +^http://[0-9a-z][0-9a-z][0-9a-z][0-9][0-9][0-9]?%.[^%./]+%.com/[a-z]+%.?[a-z][a-z][a-z]?$ +^http://[0-9a-z][0-9a-z][0-9a-z][0-9][0-9][0-9]?%.[^%./]+%.com/[a-z]+/[a-z]+[0-9]*%.?[a-z][a-z][a-z]?$ +^https?://[^/]*yahoo%.com/.+%%5C.+at%.atwola%.com +^https?://[^/]*at%.atwola%.com/ +^https?://www%.bafa%.de/ +%%5C%%22 diff --git a/extract-outlinks-patterns.txt b/extract-outlinks-patterns.txt new file mode 100644 index 0000000..bfa4645 --- /dev/null +++ b/extract-outlinks-patterns.txt @@ -0,0 +1,1112 @@ +15min.org +15minut.org +1prime.biz +24-ore.com +24.ae +7days.ae +8am.af +aamulehti.fi +abc.news.go.com +abstvradio.com +accringtonobserver.co.uk +acn.cu +ad.nl +adelante.cu +adigea.aif.ru +adsports.ae +adventure.nationalgeographic.com +af.farsnews.com +af.reuters.com +afghan-review.com +afghanews.ir +afghanislamicpress.com +afghanistannewscenter.com +afghanistansun.com +afghanistantimes.af +afghanpaper.com +aftenposten.no +aftonbladet.se +agora.co.ao +ahora.cu +ahram.org.eg +aif.ru +aiis-albania.org +airdrieecho.com +akhbarelyoum.dz +akhersaa-dz.com +al-fadjr.com +al-monitor.com +alalam.ir +alarabiya.net +albawabaeg.com +albayan.ae +albertafarmexpress.ca +alfajr-news.net +alger-info.com +algerie-focus.com +algerieconfluences.com +alhadath.net +alittihad.ae +aljazeera.com +alkhaleej.ae +alkmaarnieuws.nl +almasdarnews.com +almogaz.com +alquds.co.uk +alroeya.ae +alrugby.com +alseyassi-dz.com +alwahdanews.ae +alwatannewspaper.ae +am.radiovaticana.va +ambito.com +ameinfo.com +ana.ad +andorradifusio.ad +angliya.com +angonoticias.com +annasrdz.com +annasronline.com +antiguaobserver.com +aopnews.com +ap22.ru +apple.com +appledaily.com.tw +appleinsider.com +appleinsider.net +appleinsider.ru +aps.dz +ar.farsnews.com 
+ar.radiovaticana.va +ar.reuters.com +ar.timesofisrael.com +ara.ad +ara.reuters.com +arabianbusiness.com +arabic.sport360.com +ariananet.com +arnhemnieuws.nl +arstechnica.com +asia.nikkei.com +aspi.org.au +aspistrategist.org.au +astro.fashion.qq.com +aswatmasriya.com +atlantic.ctvnews.ca +ausairpower.net +auto.qq.com +autonews.ru +b.dk +baby.qq.com +bakhtarnews.com.af +balkanweb.com +banglaexpress.ae +barkinganddagenhampost.co.uk +barrie.ctvnews.ca +barrietoday.com +baytoday.ca +bbc.co.uk +bbc.com +bc.ctvnews.ca +be.radiovaticana.va +becclesandbungayjournal.co.uk +belegger.nl +benawa.com +bencarson.com +berlinergazette.de +berniesanders.com +bet.nl +beuningennieuws.nl +bexleytimes.co.uk +bg.radiovaticana.va +bgr.com +bigstory.ap.org +birminghammail.co.uk +birminghampost.co.uk +biz.tpo.nl +blackcountrybugle.co.uk +blikopnieuws.nl +blog.archive.org +blog.cleveland.com +blog.lesoir.be +blogs.canoe.com +blogs.wsj.com +blogues.canoe.ca +bloomberg.co.jp +bloomberg.com +bna.bh +bnn.ca +bnr.nl +boingboing.net +bondebladet.no +bondia.ad +book.qq.com +boxmeernieuws.nl +br.radiovaticana.va +br.reuters.com +br.wsj.com +brantfordexpositor.ca +bredajournaal.nl +breitbart.com +brisinst.org.au +bromleytimes.co.uk +bt.dk +btp-dz.com +buenosairesherald.com +burymercury.co.uk +buzzfeed.com.au +ca.reuters.com +calgary.ctvnews.ca +calgaryherald.com +calgarysun.com +cambstimes.co.uk +cameroonpostline.com +camrosecanadian.com +cankaoxiaoxi.com +capitalnewyork.com +catholicherald.co.uk +ceda.com.au +cesd.az +channel4.com +channelnewsasia.com +chathamthisweek.com +chealth.canoe.com +chesterchronicle.co.uk +china.kyodonews.jp +chinatimes.com +chosonsinbo.com +chosun.com +chrischristie.com +chron.com +chroniclelive.co.uk +chroom.tpo.nl +cis.org.au +citynews.ca +clarin.com +class.qq.com +cleveland.com +clintonnewsrecord.com +cn.ibtimes.com +cn.nytimes.com +cn.timesofisrael.com +cn.wsj.com +cnbc.com +cnea.gov.ar +cnet.com +cnews.ru +coastalscene24.co.uk +cochranetimes.com +cochranetimespost.ca 
+collections.unu.edu +competition.dz +computersweden.idg.se +conae.gov.ar +conicet.gov.ar +contenidos.lanacion.com.ar +coventryobserver.co.uk +coventrytelegraph.net +cp24.com +cpd.org.au +cphpost.dk +cranbrookherald.com +crewechronicle.co.uk +cronica.com.ar +cronicamendoza.com +cs.radiovaticana.va +ctvnews.ca +cubainfo.acn.cu +cubanews.acn.cu +cubasi.com +cubasi.cu +cuijknieuws.nl +cul.qq.com +cult.tpo.nl +cultofandroid.com +cultofandroid.com.feedsportal.com +cultofmac.com +cultofmac.com.feedsportal.com +cultofmac.com.ua +cybersecuritydojo.com +czechcrunch.cz +dagelijksestandaard.nl +dagen.no +dagen.se +dagens.dk +dagogtid.no +dagsavisen.no +daily-mail.co.zm +dailyafghanistan.com +dailyfinance.com +dailyheraldtribune.com +dailymail.co.uk +dailynewsegypt.com +dailynk.com +dailypost.co.uk +dailyrecord.co.uk +dailystar.co.uk +dailystar.com.lb +dajia.qq.com +dari.wadsam.com +data.gdeltproject.org +de.radiovaticana.va +de.reuters.com +delfi.lt +demokraatti.fi +demorgen.be +denboschnieuws.nl +depechedekabylie.com +derehamtimes.co.uk +destructoid.com +deutschlandradio.de +deventerjournaal.nl +devpolicy.crawford.anu.edu.au +devpolicy.org +di.se +diariandorra.ad +diariobae.com +diariopopular.com.ar +diarioshow.com +digi.tech.qq.com +dissmercury.co.uk +dn.no +dn.se +dnaindia.com +dnd.nl +docsalud.com +donaldjtrump.com +donbalon.com +donbalon.eu +dp.ru +dprktoday.com +dr.dk +draytonvalleywesternreview.com +dubaichronicle.com +dunmowbroadcast.co.uk +dutchdailynews.com +dutchinamerica.com +dutchnews.nl +dw.com +eaber.org +eadt.co.uk +eastasiaforum.org +eastlondonadvertiser.co.uk +ech-chaab.com +echoroukonline.com +economictimes.indiatimes.com +edition.cnn.com +edmonton.ctvnews.ca +edmontonjournal.com +edmontonsun.com +edsonleader.com +edu.qq.com +eg-online.ru +eindhovennieuws.nl +ekstrabladet.dk +el-hakaek.com +el-hourria.com +el-massa.com +el-youm.info +elahdath.net +elbilad.net +elciudadanoweb.com +elcolombiano.com +eldjoumhouria.dz +electronicintifada.net +elheddaf.com 
+elkhabar.com +elkhabarerriadhi.com +elliotlaketoday.com +elmakam.com +elmassar-ar.com +elmoudjahid.com +elperiodic.ad +elraaed.com +elsevier.nl +elwatan.com +elystandard.co.uk +emaratalyoum.com +emirates247.com +en.alalam.ir +en.aswatmasriya.com +en.farsnews.com +en.gigazine.net +en.hawarnews.com +en.novayagazeta.ru +en.radiovaticana.va +en.video.canoe.tv +engadget.com +english.ahram.org.eg +english.chosun.com +english.juventudrebelde.cu +english.kyodonews.jp +english.yonhapnews.co.kr +ennaharonline.com +ent.qq.com +entv.dz +environment.nationalgeographic.com +eo.radiovaticana.va +eqmweekly.com.af +es.hawarnews.com +es.radiovaticana.va +es.reuters.com +escambray.cu +ess.fi +etn.fi +eufin.nl +euronews.com +evatt.org.au +eveningnews24.co.uk +exame.co.ao +examiner.co.uk +exiledonline.com +exmouthherald.co.uk +exmouthjournal.co.uk +express.co.uk +expressandstar.com +expressen.se +fa.timesofisrael.com +fakenhamtimes.co.uk +farsnews.com +fashion.qq.com +fd.nl +feeds.24.com +feeds.arstechnica.com +feeds.bbci.co.uk +feeds.cnevids.com +feeds.feedburner.com +feeds.feedburner.jp +feeds.gawker.com +feeds.government.nl +feeds.huffingtonpost.com +feeds.ign.com +feeds.kauppalehti.fi +feeds.macrumors.com +feeds.mashable.com +feeds.news24.com +feeds.nytimes.com +feeds.sciencedaily.com +feeds.skynews.com +feeds.washingtonpost.com +feeds.webwereld.nl +feeds.wsjonline.com +feeds2.feedburner.com +feweek.co.uk +fi.radiovaticana.va +fightland.vice.com +finance.qq.com +#finance.yahoo.com +fiskeribladet.no +flip.channelnewsasia.com +forbes.com +fortmcmurraytoday.com +fortsaskatchewanrecord.com +forum.ad +foxnews.com +foxue.qq.com +fr.canoe.ca +fr.radiovaticana.va +fr.reuters.com +fr.timesofisrael.com +fr.video.canoe.tv +france24.com +frenchwam.com +friheten.no +frontpage.fok.nl +ft.com +ftp3.conae.gov.ar +fullfact.org +futbolete.com +games.qq.com +gamespy.dk +gazeta-pravda.ru +gazeta55.al +gazetanovgorod.ru +gazetayakutia.ru +gazettelive.co.uk +gazettetimes.com +getbucks.co.uk 
+gethampshire.co.uk +getreading.co.uk +getsurrey.co.uk +getwestlondon.co.uk +gfwadvertiser.ca +gigazine.net +gizmodo.com +globalnews.ca +godubai.com +gongyi.qq.com +googleblog.blogspot.com +googleblog.blogspot.nl +gov.uk +government.nl +gp.se +granma.cu +grattan.edu.au +gravesendreporter.co.uk +greatyarmouthmercury.co.uk +greenun24.co.uk +groningenjournaal.nl +guardian.ng +guccifer2.wordpress.com +guelphtoday.com +guerrillero.cu +gulfnews.com +gulftoday.ae +guruwatch.nl +gva.be +haaretz.co.il +haaretz.com +hackneygazette.co.uk +halifaxtoday.ca +hamhigh.co.uk +hamhighbroadway.co.uk +hannaherald.com +hardenbergnieuws.nl +hawarnews.com +hbl.fi +hd.stheadline.com +he.radiovaticana.va +health.qq.com +health.usnews.com +helsinkitimes.fi +heraldlive.co.za +hertsad.co.uk +heute.de +heyetnet.org +hi.radiovaticana.va +highrivertimes.com +hillaryclinton.com +hinckleytimes.net +hintonparklander.com +hk.on.cc +hln.be +horizons-dz.com +house.qq.com +hr.radiovaticana.va +hrnicholls.com.au +hs.fi +hu.radiovaticana.va +huffingtonpost.com +huntspost.co.uk +hy.radiovaticana.va +i-d.vice.com +iamexpat.nl +ib.edu.ar +ibinda.com +ibtimes.co.in +ibtimes.co.uk +ibtimes.com +ibtimes.com.au +icelandreview.com +idag.no +iex.nl +iexgeld.nl +iexprofs.nl +ilfordrecorder.co.uk +ilkka.fi +iltalehti.fi +iltasanomat.fi +in.reuters.com +independent.co.uk +indianexpress.com +infocanuelas.com +infosoir.com +infoworld.com +inta.gob.ar +intelligencer.ca +international.nytimes.com +internationalaffairs.org.au +inti.gob.ar +inti.gov.ar +invasor.cu +io-tech.fi +ipa.org.au +ipolitics.ca +ips.cap.anu.edu.au +ipswichstar.co.uk +iraq-amsi.net +irna.ir +islingtongazette.co.uk +it.ibtimes.com +it.reuters.com +itv.com +itviikko.fi +izvestia.ru +ja.radiovaticana.va +japantimes.co.jp +jeugdjournaal.nl +jeune-independant.net +jia360.com +johnkasich.com +joop.nl +jornaldeangola.sapo.ao +jornaldosdesportos.sapo.ao +jornalf8.net +journaldemontreal.com +jp.ibtimes.com +jp.reuters.com +jp.techcrunch.com +jp.vice.com 
+jp.wsj.com +juventudrebelde.cu +jyllands-posten.dk +kabayanweekly.com +kabulpress.org +kaleva.fi +kansalainen.fi +kansanuutiset.fi +karjalainen.fi +karjalansanomat.ru +kawalisse.com +kbctv.co.ke +kenoradailyminerandnews.com +kentnews.co.uk +kentonline.co.uk +khaama.com +khabarafghan.com +khaleejtimes.com +kid.qq.com +kids.nationalgeographic.com +kilburntimes.co.uk +kincardinenews.com +kingstonthisweek.com +kitchener.ctvnews.ca +klassekampen.no +kodima.rkperiodika.ru +kohajone.com +kommersant.ru +koreatimes.co.kr +kotaku.com +kp.ru +kr.nknews.org +kr.radiovaticana.va +kristeligt-dagblad.dk +ksml.fi +ktimes.com +ku.hawarnews.com +lacapital.com.ar +lactualite-dz.info +lakeshoreadvance.com +lanacion.com.ar +lanueva.com +lapinkansa.fi +laprensa.com.ar +lapresse.tn +larawbar.net +larazon.com.ar +lat.wsj.com +latimes.com +latribune-dz.com +lautomarche.com +lavoz.com.ar +lawandtax-news.com +leaderpost.com +lejourdalgerie.com +leloir.org.ar +lemaghrebdz.com +lematindz.net +lemauricien.com +lemidi-dz.com +lemonde.fr +leparisien.fr +lequotidien-oran.com +lesnouvellesnews.fr +lesoir.be +lesoirdalgerie.com +lestrepublicain.com +letempsdz.com +lexpressiondz.com +lfpress.com +lgz.ru +liberte-algerie.com +libyaherald.com +lifehacker.com +live.huffingtonpost.com +liveleak.com +liverpoolecho.co.uk +lnr-dz.com +london.ctvnews.ca +looopings.nl +losandes.com.ar +loughboroughecho.net +lowestoftjournal.co.uk +lowyinstitute.org +lrt.lt +lt.radiovaticana.va +lta.reuters.com +ltn.com.tw +lv.radiovaticana.va +maaseuduntulevaisuus.fi +macclesfield-express.co.uk +mackungfu.org +macleans.ca +macrumors.com +macrumors.ro +madagascar-tribune.com +madamasr.com +mailonsunday.co.uk +managementherald.com.ar +manchestereveningnews.co.uk +mandegardaily.com +mannkal.org +mannwest.com +marcorubio.com +marketwatch.com +marmai.fi +marsad.ly +mashable.com +mashable.pw +mayerthorpefreelancer.com +media.tpo.nl +menziesrc.org +meridianbooster.com +mes.ad +metro.co.uk +metro.fi +metro.se +metrohk.com.hk 
+metronews.ca +metronieuws.nl +mg.co.za +middleeasteye.net +midnorthmonitor.com +midweekherald.co.uk +mikrobitti.fi +mil.qq.com +mingpao.com +mirror.co.uk +mk.radiovaticana.va +mk.ru +mkset.ru +ml.radiovaticana.va +mn.ru +mobilefeeds.wsj.com +moheet.com +money.rbc.ru +money.usnews.com +monitor.co.ug +montreal.ctvnews.ca +montrealgazette.com +morgenbladet.no +morningstaronline.co.uk +mospravda.ru +motherboard.vice.com +motors-dz.com +mountain-news.com +msnbc.com +munchies.vice.com +mundod.lavoz.com.ar +mx.dk +mx.reuters.com +naenara.com.kp +nanaimodailynews.com +nantonnews.com +nasdaq.com +nation.co.ke +nationalobserver.com +nationalpost.com +nationen.no +navbharattimes.indiatimes.com +nbcnews.com +nd.nl +nederlandnieuws.nl +nerjanieuws.nl +newburytoday.co.uk +newhamrecorder.co.uk +newizv.ru +newlookmedia.ru +news.com.au +news.cubasi.cu +news.ltn.com.tw +news.mingpao.com +news.nationalgeographic.com +news.nationalpost.com +news.qq.com +news.sky.com +news.tbs.co.jp +news.vice.com +news.vip-urlaub.de +news.yahoo.com +news24.com +newscentralasia.net +newsletter.co.uk +newsmonkey.be +newsrss.bbc.co.uk +newtimes.co.rw +newvision.co.ug +ng.ru +niagarafallsreview.ca +nieuws.tpo.nl +nijmegennieuws.nl +nikkei.com +nisnews.nl +nknews.org +nltimes.nl +noisey.vice.com +north-africa.com +northdevongazette.co.uk +northernontario.ctvnews.ca +northnorfolknews.co.uk +northsomersettimes.co.uk +norwichadvertiser24.co.uk +norwichgazette.com +nos.nl +notinet.icrt.cu +novayagazeta.ru +novojornal.co.ao +novosti.acn.cu +npr.org +nrc.nl +nrk.no +nsl-basketball.sport360.com +nsl-football.sport360.com +nsl.sport360.com +nu.nl +nugget.ca +nunatsiaqonline.ca +nycity.today +nyheder.tv2.dk +nypost.com +nytid.no +nytimes.com +nzherald.co.nz +o.canada.com +og.ru +ohio.com +one.iex.nl +onionstudios.com +opais.co.ao +orientaldaily.on.cc +osservatoreromano.va +ossnieuws.nl +ottawa.ctvnews.ca +ottawacitizen.com +ottawasun.com +ouarsenis.com +ouest-france.fr +ouestribune-dz.com +ourworld.unu.edu 
+outlookafghanistan.net +owensoundsuntimes.com +oxfordtimes.co.uk +pagina12.com.ar +pajhwok.com +panorama-sport.com +panorama.com.al +parool.nl +participaties.nl +pdc.tv +percapita.org.au +periodico26.cu +photography.nationalgeographic.com +pinchercreekecho.com +pl.radiovaticana.va +pm.gc.ca +pnp.ru +politico.com +politico.eu +politiek.tpo.nl +politifact.com +polygon.com +portalangop.co.ao +portfolio.lesoir.be +postzambia.com +powned.tv +pqbnews.com +pressandjournal.co.uk +presstv.ir +prnewsonline.com +prosper.org.au +province.ru +prrecordgazette.com +pt.radiovaticana.va +qq.com +quote.rbc.ru +quotidien-oran.com +radio.nrk.no +radioalgerie.dz +radiolome.tg +randpaul.com +raqqa-sl.com +rawstory.com +rbc.ru +rbth.com +readwrite.com +recorder.ca +redstar.ru +refdag.nl +regina.ctvnews.ca +regio.tpo.nl +republicoftogo.com +reuters.com +rg.ru +ria.ru +rionegro.com.ar +ro.radiovaticana.va +rodong.rep.kp +romfordrecorder.co.uk +rossendalefreepress.co.uk +royston-crow.co.uk +rss.canada.com +rss.canoe.com +rss.cnn.com +rss.dw.com +rss.feedsportal.com +rss.nytimes.com +rss.upi.com +rt.com +rtl7darts.nl +rtlnieuws.nl +ru.hawarnews.com +ru.radiovaticana.va +ru.reuters.com +rumbosdigital.com +ruokala.net +ruscur.ru +sabawoon.com +sackvilletribunepost.com +saffronwaldenreporter.co.uk +sam.az +sammobile.com +sargasso.nl +saskatoon.ctvnews.ca +satakunnankansa.fi +saultstar.com +savonsanomat.fi +sawt-alahrar.net +sci-news.com +sciencedaily.com +sciencenews.org +scotlandnow.dailyrecord.co.uk +semanarioeconomico.co.ao +sfgate.com +sidmouthherald.co.uk +siliconprairienews.com +simcoereformer.ca +sk.radiovaticana.va +sl.radiovaticana.va +sobesednik.ru +sootoday.com +sot.com.al +southportvisiter.co.uk +sovsakh.ru +sovsport.ru +spbvedomosti.ru +spiegel.de +sport-express.ru +sport.rbc.ru +sport360.com +sports.qq.com +sports.vice.com +#sports.yahoo.com +sq.radiovaticana.va +standaard.be +standard-freeholder.com +standard.co.uk +static.feed.rbc.ru +stcatharinesstandard.ca +std.stheadline.com 
+stheadline.com +stock.qq.com +stowmarketmercury.co.uk +stratfordbeaconherald.com +strathmorestandard.com +stthomastimesjournal.com +student.societyforscience.org +sudburymercury.co.uk +sunnewsonline.com +suomenmaa.fi +suomenuutiset.fi +super.ae +sustg.com +sv.radiovaticana.va +svd.se +svenska.yle.fi +svt.se +sw.radiovaticana.va +ta.radiovaticana.va +taand.com +tagesschau.de +tai.org.au +taipeitimes.com +talk.tpo.nl +taloussanomat.fi +tchina.kyodonews.jp +tech.qq.com +techcrunch.asia +techcrunch.cn +techcrunch.com +techradar.me +tedcruz.org +tehrantimes.com +tekniikanmaailma.fi +telegraaf.nl +telegraph.co.uk +thanhnien.vn +the-japan-news.com +theantiguan.com +thearabianpost.com +theatlantic.com +theautonet.com +thebeaverton.com +thechronicleherald.ca +thecomet.net +thecragandcanyon.ca +thecreatorsproject.vice.com +thedailyobserver.ca +thedailystar.net +theglobeandmail.com +theguardian.com +thehindu.com +theindependent.co.zw +theintercept.com +thelocal.fr +themoscowtimes.com +thenational.ae +thenationalstudent.com +thenextweb.com +theonion.com +thepeterboroughexaminer.com +theprovince.com +theregister.co.uk +therwandan.com +thestage.co.uk +thestandard.com.hk +thestar.com +thestarphoenix.com +thesudburystar.com +thesun.co.uk +thesydneyinstitute.com.au +thetfordandbrandontimes.co.uk +thetimes.co.uk +theverge.com +theweathernetwork.com +thewestonmercury.co.uk +thewhig.com +thisdaylive.com +thump.vice.com +ti.radiovaticana.va +tielnieuws.nl +tilburgnieuws.nl +time.com +times.co.zm +timescolonist.com +timesofindia.indiatimes.com +timesofisrael.com +timminspress.com +timminstoday.com +tivi.fi +tmz.com +today.ng +todayszaman.com +togozine.com +tolafghan.com +tomshardware.com +toronto.ctvnews.ca +torontosun.com +torrentfreak.com +tpo.nl +tr.farsnews.com +tr.hawarnews.com +trabajadores.cu +transactiondalgerie.com +travel.nationalgeographic.com +travel.usnews.com +tribune.com.pk +trouw.nl +trud.ru +ts.fi +tumentoday.ru +tuoitrenews.vn +tv.echoroukonline.com +tv.rbc.ru 
+tverlife.ru +tvt.tg +tweakers.net +twenterandnieuws.nl +uaeinteract.com +udennieuws.nl +udn.com +uk.radiovaticana.va +uk.reuters.com +unu.edu +upi.com +uriminzokkiri.com +usatoday.com +usnews.com +ussc.edu.au +utrechtjournaal.nl +uusisuomi.fi +v.qq.com +vancouverisland.ctvnews.ca +vancouverobserver.com +vanguardia.cu +vanguardngr.com +vaterland.li +vechorka.ru +vedomosti.ru +veghelnieuws.nl +veintitres.com.ar +vendingtimes.com +verkkouutiset.fi +vi.radiovaticana.va +vice.cn +vice.com +viceland.com +video.asia.nikkei.com +video.cnbc.com +video.nationalgeographic.com +video.usnews.com +video.vice.com +video.wired.com +videos.leparisien.fr +vihrealanka.fi +vl.no +vmnews.ru +vn.ru +volkskrant.nl +vos.lavoz.com.ar +vremya.ru +vulcanadvocate.com +wadsam.com +wakteldjazair.com +walesonline.co.uk +wam.ae +washingtonpost.com +wattonandswaffhamtimes.co.uk +waveneyadvertiser24.co.uk +web.kbcalgerie.tv +webwereld.nl +wharf.co.uk +whitecourtstar.com +whtimes.co.uk +wijchennieuws.nl +wikileaks.org +windsor.ctvnews.ca +windsorstar.com +winnipeg.ctvnews.ca +winnipegsun.com +wired.com +wisbechstandard.co.uk +woodstocksentinelreview.com +wsj.com +www3.nhk.or.jp +wymondhamandattleboroughmercury.co.uk +xinhuanet.com +yarmouthadvertiser24.co.uk +yemen-nn.com +yenisafak.com +yle.fi +ynet.co.il +ynetnews.com +yomiuri.co.jp +yonhapnews.co.kr +yonhapnews.feedsportal.com +ypgrojava.com +zaman.com.tr +zamanarabic.com +zamanfrance.fr +zamankurdi.com +zh.radiovaticana.va +zwollenieuws.nl + diff --git a/get-wget-lua.sh b/get-wget-lua.sh new file mode 100755 index 0000000..9ce79e6 --- /dev/null +++ b/get-wget-lua.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# +# This script clones and compiles wget-lua. +# Exits 0 after copying the built binary to ./wget-at; exits 1 if a prerequisite, checkout or build step fails. + +# first, try to detect gnutls or openssl +CONFIGURE_SSL_OPT="" +if builtin type -p pkg-config &>/dev/null +then + if pkg-config gnutls + then + echo "Compiling wget with GnuTLS." + CONFIGURE_SSL_OPT="--with-ssl=gnutls" + elif pkg-config openssl + then + echo "Compiling wget with OpenSSL."
+ CONFIGURE_SSL_OPT="--with-ssl=openssl" + fi +fi + +if ! zstd --version | grep -qF 1.4.4 +then + echo "Need version 1.4.4 of libzstd-dev and zstd" + exit 1 +fi + +rm -rf get-wget-lua.tmp/ +mkdir -p get-wget-lua.tmp +# From here on every step aborts on failure, so nothing is cloned or built in the wrong directory. +cd get-wget-lua.tmp || exit 1 + +git clone https://github.com/archiveteam/wget-lua.git || exit 1 + +cd wget-lua || exit 1 +git checkout v1.20.3-at || exit 1 + +#echo -n 1.20.3-at-lua | tee ./.version ./.tarball-version > /dev/null +# $CONFIGURE_SSL_OPT is deliberately unquoted: when empty it must expand to no argument at all. +if ./bootstrap && ./configure $CONFIGURE_SSL_OPT --disable-nls && make && src/wget -V | grep -q lua +then + cp src/wget ../../wget-at || exit 1 + cd ../../ || exit 1 + echo + echo + echo "###################################################################" + echo + echo "wget-lua successfully built." + echo + ./wget-at --help | grep -iE "gnu|warc|lua" + rm -rf get-wget-lua.tmp + exit 0 +else + echo + echo "wget-lua not successfully built." + echo + exit 1 +fi diff --git a/ignore-patterns.txt b/ignore-patterns.txt new file mode 100644 index 0000000..12b7039 --- /dev/null +++ b/ignore-patterns.txt @@ -0,0 +1,21 @@ +[%?&]ver=[0-9a-zA-Z%.]*%.16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9] +[%?&]ver=16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9] +[%?&]t=16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ +[%?&]t=16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]%.[0-9]+$ +[%?&]hash=16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ +%?16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ +%?16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ +%?6[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ +%?v=[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ +;extid=[0-9a-f]+$ +[%?&;]_flowexecutionkey= +[%?&;]sid= +[%?&;]cid= +[%?&;]jsessionid= +[%?&;]script_case_session= +[%?&;]Dilid= +[%?&;][pP][hH][pP][sS][eE][sS][sS][iI][dD]= +[%?&;]wtd= +[%?&;]nonce= +[%?&;]rnd= +^https?://[^/]+/index%.php%?s= diff --git a/page-requisite-patterns.txt b/page-requisite-patterns.txt new file mode 100644 index 0000000..f519342 --- /dev/null +++
b/page-requisite-patterns.txt @@ -0,0 +1,17 @@ +%.apng +%.avif +%.gif +%.jpe?g +%.jfif +%.pjpeg +%.pjp +%.png +%.svg +%.webp +%.bmp +%.ico +%.cur +%.tif +%.tiff +%.js +%.css diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000..d05b1bb --- /dev/null +++ b/pipeline.py @@ -0,0 +1,425 @@ +# encoding=utf8 +import datetime +from distutils.version import StrictVersion +import hashlib +import json +import os +import random +import shutil +import socket +import subprocess +import sys +import threading +import time +import string +import sys + +if sys.version_info[0] < 3: + from urllib import unquote + from urlparser import parse_qs +else: + from urllib.parse import unquote, parse_qs + +import requests +import seesaw +from seesaw.config import realize, NumberConfigValue +from seesaw.externalprocess import WgetDownload +from seesaw.item import ItemInterpolation, ItemValue +from seesaw.pipeline import Pipeline +from seesaw.project import Project +from seesaw.task import SimpleTask, LimitConcurrent +from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \ + UploadWithTracker, SendDoneToTracker +from seesaw.util import find_executable +import zstandard + +if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'): + raise Exception('This pipeline needs seesaw version 0.8.5 or higher.') + +LOCK = threading.Lock() + + +########################################################################### +# Find a useful Wget+Lua executable. +# +# WGET_AT will be set to the first path that +# 1. does not crash with --version, and +# 2. prints the required version string + +WGET_AT = find_executable( + 'Wget+AT', + [ + 'GNU Wget 1.20.3-at.20211001.01' + ], + [ + './wget-at', + '/home/warrior/data/wget-at' + ] +) + +if not WGET_AT: + raise Exception('No usable Wget+At found.') + + +########################################################################### +# The version number of this pipeline definition. 
+# +# Update this each time you make a non-cosmetic change. +# It will be added to the WARC files and reported to the tracker. +VERSION = '20220423.01' +#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36' +TRACKER_ID = 'urls' +TRACKER_HOST = 'legacy-api.arpa.li' +MULTI_ITEM_SIZE = 40 +MAX_DUPES_LIST_SIZE = 10000 + +########################################################################### +# This section defines project-specific tasks. +# +# Simple tasks (tasks that do not need any concurrency) are based on the +# SimpleTask class and have a process(item) method that is called for +# each item. +class CheckIP(SimpleTask): + def __init__(self): + SimpleTask.__init__(self, 'CheckIP') + self._counter = 0 + + def process(self, item): + # NEW for 2014! Check if we are behind firewall/proxy + + if self._counter <= 0: + item.log_output('Checking IP address.') + ip_set = set() + + ip_set.add(socket.gethostbyname('twitter.com')) + #ip_set.add(socket.gethostbyname('facebook.com')) + ip_set.add(socket.gethostbyname('youtube.com')) + ip_set.add(socket.gethostbyname('microsoft.com')) + ip_set.add(socket.gethostbyname('icanhas.cheezburger.com')) + ip_set.add(socket.gethostbyname('archiveteam.org')) + + if len(ip_set) != 5: + item.log_output('Got IP addresses: {0}'.format(ip_set)) + item.log_output( + 'Are you behind a firewall/proxy? That is a big no-no!') + raise Exception( + 'Are you behind a firewall/proxy? 
That is a big no-no!') + + # Check only occasionally + if self._counter <= 0: + self._counter = 10 + else: + self._counter -= 1 + + +class CheckRequirements(SimpleTask): + def __init__(self): + SimpleTask.__init__(self, 'CheckRequirements') + self._checked = False + + def process(self, item): + if not self._checked: + assert shutil.which('pdftohtml') is not None + self._checked = True + + +class PrepareDirectories(SimpleTask): + def __init__(self, warc_prefix): + SimpleTask.__init__(self, 'PrepareDirectories') + self.warc_prefix = warc_prefix + + def process(self, item): + item_name = item['item_name'] + item_name_hash = hashlib.sha1(item_name.encode('utf8')).hexdigest() + escaped_item_name = item_name_hash + dirname = '/'.join((item['data_dir'], escaped_item_name)) + + if os.path.isdir(dirname): + shutil.rmtree(dirname) + + os.makedirs(dirname) + + item['item_dir'] = dirname + item['warc_file_base'] = '-'.join([ + self.warc_prefix, + item_name_hash, + time.strftime('%Y%m%d-%H%M%S') + ]) + + if not os.path.isfile('duplicate-urls.txt'): + open('duplicate-urls.txt', 'w').close() + + open('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, 'w').close() + open('%(item_dir)s/%(warc_file_base)s_bad-urls.txt' % item, 'w').close() + open('%(item_dir)s/%(warc_file_base)s_duplicate-urls.txt' % item, 'w').close() + + +class MoveFiles(SimpleTask): + def __init__(self): + SimpleTask.__init__(self, 'MoveFiles') + + def process(self, item): + os.rename('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, + '%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst' % item) + + shutil.rmtree('%(item_dir)s' % item) + + +class SetBadUrls(SimpleTask): + def __init__(self): + SimpleTask.__init__(self, 'SetBadUrls') + + def unquote_url(self, url): + temp = unquote(url) + while url != temp: + url = temp + temp = unquote(url) + return url + + def process(self, item): + item['item_name_original'] = item['item_name'] + items = item['item_name'].split('\0') + items_lower = 
[self.unquote_url(url).strip().lower() for url in item['item_urls']] + with open('%(item_dir)s/%(warc_file_base)s_bad-urls.txt' % item, 'r') as f: + for url in { + self.unquote_url(url).strip().lower() for url in f + }: + index = items_lower.index(url) + items.pop(index) + items_lower.pop(index) + item['item_name'] = '\0'.join(items) + + +class SetDuplicateUrls(SimpleTask): + def __init__(self): + SimpleTask.__init__(self, 'SetNewDuplicates') + + def process(self, item): + with LOCK: + self._process(item) + + def _process(self, item): + with open('duplicate-urls.txt', 'r') as f: + duplicates = {s.strip() for s in f} + with open('%(item_dir)s/%(warc_file_base)s_duplicate-urls.txt' % item, 'r') as f: + for url in f: + duplicates.add(url.strip()) + with open('duplicate-urls.txt', 'w') as f: + # choose randomly, to cycle periodically popular URLs + duplicates = list(duplicates) + random.shuffle(duplicates) + f.write('\n'.join(duplicates[:MAX_DUPES_LIST_SIZE])) + + +class MaybeSendDoneToTracker(SendDoneToTracker): + def enqueue(self, item): + if len(item['item_name']) == 0: + return self.complete_item(item) + return super(MaybeSendDoneToTracker, self).enqueue(item) + + +def get_hash(filename): + with open(filename, 'rb') as in_file: + return hashlib.sha1(in_file.read()).hexdigest() + +CWD = os.getcwd() +PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py')) +LUA_SHA1 = get_hash(os.path.join(CWD, 'urls.lua')) + +def stats_id_function(item): + d = { + 'pipeline_hash': PIPELINE_SHA1, + 'lua_hash': LUA_SHA1, + 'python_version': sys.version, + } + + return d + + +class ZstdDict(object): + created = 0 + data = None + + @classmethod + def get_dict(cls): + if cls.data is not None and time.time() - cls.created < 1800: + return cls.data + response = requests.get( + 'https://legacy-api.arpa.li/dictionary', + params={ + 'project': TRACKER_ID + } + ) + response.raise_for_status() + response = response.json() + if cls.data is not None and response['id'] == cls.data['id']: + 
cls.created = time.time() + return cls.data + print('Downloading latest dictionary.') + response_dict = requests.get(response['url']) + response_dict.raise_for_status() + raw_data = response_dict.content + if hashlib.sha256(raw_data).hexdigest() != response['sha256']: + raise ValueError('Hash of downloaded dictionary does not match.') + if raw_data[:4] == b'\x28\xB5\x2F\xFD': + raw_data = zstandard.ZstdDecompressor().decompress(raw_data) + cls.data = { + 'id': response['id'], + 'dict': raw_data + } + cls.created = time.time() + return cls.data + + +class WgetArgs(object): + def realize(self, item): + with open('user-agents.txt', 'r') as f: + USER_AGENT = random.choice(list(f)).strip() + wget_args = [ + 'timeout', '1000', + WGET_AT, + '-U', USER_AGENT, + '-v', + '--content-on-error', + '--lua-script', 'urls.lua', + '-o', ItemInterpolation('%(item_dir)s/wget.log'), + #'--no-check-certificate', + '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), + '--truncate-output', + '-e', 'robots=off', + '--rotate-dns', + '--recursive', '--level=inf', + '--no-parent', + '--timeout', '10', + '--tries', '2', + '--span-hosts', + '--page-requisites', + '--waitretry', '0', + '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), + '--warc-header', 'operator: Archive Team', + '--warc-header', 'x-wget-at-project-version: ' + VERSION, + '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID, + '--warc-dedup-url-agnostic', + '--warc-compression-use-zstd', + '--warc-zstd-dict-no-include', + '--header', 'Connection: keep-alive', + '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8' + ] + + dict_data = ZstdDict.get_dict() + with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f: + f.write(dict_data['dict']) + item['dict_id'] = dict_data['id'] + item['dict_project'] = TRACKER_ID + wget_args.extend([ + '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'), + ]) + + item['item_name'] = '\0'.join([ + item_name for item_name in 
item['item_name'].split('\0') + if (item_name.startswith('custom:') and '&url=' in item_name) \ + or item_name.startswith('http://') \ + or item_name.startswith('https://') \ + ]) + + item['item_name_newline'] = item['item_name'].replace('\0', '\n') + item_urls = [] + custom_items = {} + + for item_name in item['item_name'].split('\0'): + wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name]) + wget_args.append('item-name://'+item_name) + if item_name.startswith('custom:'): + data = parse_qs(item_name.split(':', 1)[1]) + for k, v in data.items(): + if len(v) == 1: + data[k] = v[0] + url = data['url'] + custom_items[url.lower()] = data + else: + url = item_name + item_urls.append(url) + wget_args.append(url) + + item['item_urls'] = item_urls + item['custom_items'] = json.dumps(custom_items) + + if 'bind_address' in globals(): + wget_args.extend(['--bind-address', globals()['bind_address']]) + print('') + print('*** Wget will bind address at {0} ***'.format( + globals()['bind_address'])) + print('') + + return realize(wget_args, item) + +########################################################################### +# Initialize the project. +# +# This will be shown in the warrior management panel. The logo should not +# be too big. The deadline is optional. +project = Project( + title = 'URLs', + project_html = ''' + +

Archiving sets of discovered outlinks. · Leaderboard

+ ''' +) + +pipeline = Pipeline( + CheckIP(), + CheckRequirements(), + GetItemFromTracker('https://{}/{}/multi={}/' + .format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE), + downloader, VERSION), + PrepareDirectories(warc_prefix='urls'), + WgetDownload( + WgetArgs(), + max_tries=1, + accept_on_exit_code=[0, 4, 8], + env={ + 'item_dir': ItemValue('item_dir'), + 'item_name': ItemValue('item_name_newline'), + 'custom_items': ItemValue('custom_items'), + 'warc_file_base': ItemValue('warc_file_base') + } + ), + SetBadUrls(), + SetDuplicateUrls(), + PrepareStatsForTracker( + defaults={'downloader': downloader, 'version': VERSION}, + file_groups={ + 'data': [ + ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst') + ] + }, + id_function=stats_id_function, + ), + MoveFiles(), + LimitConcurrent(NumberConfigValue(min=1, max=20, default='2', + name='shared:rsync_threads', title='Rsync threads', + description='The maximum number of concurrent uploads.'), + UploadWithTracker( + 'https://%s/%s' % (TRACKER_HOST, TRACKER_ID), + downloader=downloader, + version=VERSION, + files=[ + ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst') + ], + rsync_target_source_path=ItemInterpolation('%(data_dir)s/'), + rsync_extra_args=[ + '--recursive', + '--partial', + '--partial-dir', '.rsync-tmp', + '--min-size', '1', + '--no-compress', + '--compress-level', '0' + ] + ), + ), + MaybeSendDoneToTracker( + tracker_url='https://%s/%s' % (TRACKER_HOST, TRACKER_ID), + stats=ItemValue('stats') + ) +) + diff --git a/urls.lua b/urls.lua new file mode 100644 index 0000000..e95b1fc --- /dev/null +++ b/urls.lua @@ -0,0 +1,942 @@ +local urlparse = require("socket.url") +local http = require("socket.http") +JSON = (loadfile "JSON.lua")() + +local item_dir = os.getenv("item_dir") +local item_name = os.getenv("item_name") +local custom_items = os.getenv("custom_items") +local warc_file_base = os.getenv("warc_file_base") + +local url_count = 0 +local downloaded = {} 
+local abortgrab = false +local exit_url = false +local min_dedup_mb = 5 + +local timestamp = nil + +if urlparse == nil or http == nil then + io.stdout:write("socket not corrently installed.\n") + io.stdout:flush() + abortgrab = true +end + +local urls = {} +for url in string.gmatch(item_name, "([^\n]+)") do + urls[string.lower(url)] = true +end + +local urls_settings = JSON:decode(custom_items) +for k, _ in pairs(urls_settings) do + urls[string.lower(k)] = true +end + +local status_code = nil + +local redirect_urls = {} +local visited_urls = {} +local ids_to_ignore = {} +for _, lengths in pairs({{8, 4, 4, 4, 12}, {8, 4, 4, 12}}) do + local uuid = "" + for _, i in pairs(lengths) do + for j=1,i do + uuid = uuid .. "[0-9a-fA-F]" + end + if i ~= 12 then + uuid = uuid .. "%-" + end + end + ids_to_ignore[uuid] = true +end +local to_ignore = "" +for i=1,9 do + to_ignore = to_ignore .. "[0-9]" +end +ids_to_ignore["%?" .. to_ignore .. "$"] = true +ids_to_ignore["%?" .. to_ignore .. "[0-9]$"] = true +ids_to_ignore[to_ignore .. "[0-9]%.[0-9][0-9][0-9][0-9]$"] = true +to_ignore = "" +for i=1,50 do + to_ignore = to_ignore .. "[0-9a-zA-Z]" +end +ids_to_ignore[to_ignore .. "%-[0-9][0-9][0-9][0-9][0-9]"] = true +ids_to_ignore["[0-9a-zA-Z%-_]!%-?[0-9]"] = true +to_ignore = "" +for i=1,32 do + to_ignore = to_ignore .. "[0-9a-fA-F]" +end +ids_to_ignore["[^0-9a-fA-F]" .. to_ignore .. "[^0-9a-fA-F]"] = true +ids_to_ignore["[^0-9a-fA-F]" .. to_ignore .. 
"$"] = true + +local current_url = nil +local current_settings = nil +local bad_urls = {} +local queued_urls = {} +local bad_params = {} +local bad_patterns = {} +local ignore_patterns = {} +local page_requisite_patterns = {} +local duplicate_urls = {} +local extract_outlinks_patterns = {} +local item_first_url = nil +local redirect_domains = {} +local checked_domains = {} + +local parenturl_uuid = nil +local parenturl_requisite = nil + +local dupes_file = io.open("duplicate-urls.txt", "r") +for url in dupes_file:lines() do + duplicate_urls[url] = true +end +dupes_file:close() + +local bad_params_file = io.open("bad-params.txt", "r") +for param in bad_params_file:lines() do + local param = string.gsub( + param, "([a-zA-Z])", + function(c) + return "[" .. string.lower(c) .. string.upper(c) .. "]" + end + ) + table.insert(bad_params, param) +end +bad_params_file:close() + +local bad_patterns_file = io.open("bad-patterns.txt", "r") +for pattern in bad_patterns_file:lines() do + table.insert(bad_patterns, pattern) +end +bad_patterns_file:close() + +local ignore_patterns_file = io.open("ignore-patterns.txt", "r") +for pattern in ignore_patterns_file:lines() do + table.insert(ignore_patterns, pattern) +end +ignore_patterns_file:close() + +local page_requisite_patterns_file = io.open("page-requisite-patterns.txt", "r") +for pattern in page_requisite_patterns_file:lines() do + table.insert(page_requisite_patterns, pattern) +end +page_requisite_patterns_file:close() + +local extract_outlinks_patterns_file = io.open("extract-outlinks-patterns.txt", "r") +for pattern in extract_outlinks_patterns_file:lines() do + extract_outlinks_patterns[pattern] = true +end +extract_outlinks_patterns_file:close() + +read_file = function(file, bytes) + if not bytes then + bytes = "*all" + end + if file then + local f = assert(io.open(file)) + local data = f:read(bytes) + f:close() + if not data then + data = "" + end + return data + else + return "" + end +end + +table_length = function(t) + 
local count = 0 + for _ in pairs(t) do + count = count + 1 + end + return count +end + +check_domain_outlinks = function(url, target) + local parent = string.match(url, "^https?://([^/]+)") + while parent do + if (not target and extract_outlinks_patterns[parent]) + or (target and parent == target) then + return parent + end + parent = string.match(parent, "^[^%.]+%.(.+)$") + end + return false +end + +bad_code = function(status_code) + return status_code ~= 200 + and status_code ~= 301 + and status_code ~= 302 + and status_code ~= 303 + and status_code ~= 307 + and status_code ~= 308 + and status_code ~= 404 + and status_code ~= 410 +end + +find_path_loop = function(url, max_repetitions) + local tested = {} + for s in string.gmatch(urlparse.unescape(url), "([^/]+)") do + s = string.lower(s) + if not tested[s] then + if s == "" then + tested[s] = -2 + else + tested[s] = 0 + end + end + tested[s] = tested[s] + 1 + if tested[s] == max_repetitions then + return true + end + end + return false +end + +percent_encode_url = function(url) + temp = "" + for c in string.gmatch(url, "(.)") do + local b = string.byte(c) + if b < 32 or b > 126 then + c = string.format("%%%02X", b) + end + temp = temp .. 
c + end + return temp +end + +queue_url = function(url, withcustom) + if not url then + return nil + end + queue_new_urls(url) + if not string.match(url, "^https?://[^/]+%.") then + return nil + end +--local original = url + load_setting_depth = function(s) + n = tonumber(current_settings[s]) + if n == nil then + n = 0 + end + return n - 1 + end + url = string.gsub(url, "'%s*%+%s*'", "") + url = percent_encode_url(url) + url = string.match(url, "^([^{]+)") + url = string.match(url, "^([^<]+)") + url = string.match(url, "^([^\\]+)") + if current_settings and current_settings["all"] and withcustom then + local depth = load_setting_depth("depth") + local keep_random = load_setting_depth("keep_random") + local keep_all = load_setting_depth("keep_all") + local any_domain = load_setting_depth("any_domain") + if depth >= 0 then + local random = current_settings["random"] + local all = current_settings["all"] + if keep_random < 0 or random == "" then + random = nil + keep_random = nil + end + if keep_all < 0 or all == 0 then + all = nil + keep_all = nil + end + if any_domain <= 0 then + any_domain = nil + end + local settings = { + depth=depth, + all=all, + keep_all=keep_all, + random=random, + keep_random=keep_random, + url=url, + any_domain=any_domain + } + url = "custom:" + for _, k in pairs( + {"all", "any_domain", "depth", "keep_all", "keep_random", "random", "url"} + ) do + local v = settings[k] + if v ~= nil then + url = url .. k .. "=" .. urlparse.escape(tostring(v)) .. "&" + end + end + url = string.sub(url, 1, -2) + end + end + if not duplicate_urls[url] and not queued_urls[url] then + if find_path_loop(url, 2) then + return false + end +--print("queuing",original, url) + queued_urls[url] = true + end +end + +queue_monthly_url = function(url) + local random_s = os.date("%Y%m", timestamp) + url = percent_encode_url(url) + queued_urls["custom:random=" .. random_s .. "&url=" .. 
urlparse.escape(tostring(url))] = true +end + +remove_param = function(url, param_pattern) + local newurl = url + repeat + url = newurl + newurl = string.gsub(url, "([%?&;])" .. param_pattern .. "=[^%?&;]*[%?&;]?", "%1") + until newurl == url + return string.match(newurl, "^(.-)[%?&;]?$") +end + +queue_new_urls = function(url) + if not url then + return nil + end + local newurl = string.gsub(url, "([%?&;])[aA][mM][pP];", "%1") + if url == current_url then + if newurl ~= url then + queue_url(newurl) + end + end + for _, param_pattern in pairs(bad_params) do + newurl = remove_param(newurl, param_pattern) + end + if newurl ~= url then + queue_url(newurl) + end + newurl = string.match(newurl, "^([^%?&]+)") + if newurl ~= url then + queue_url(newurl) + end + url = string.gsub(url, """, '"') + url = string.gsub(url, "&", "&") + for newurl in string.gmatch(url, '([^"\\]+)') do + if newurl ~= url then + queue_url(newurl) + end + end +end + +report_bad_url = function(url) + if current_url ~= nil then + bad_urls[current_url] = true + else + bad_urls[string.lower(url)] = true + end +end + +strip_url = function(url) + url = string.match(url, "^https?://(.+)$") + newurl = string.match(url, "^www%.(.+)$") + if newurl then + url = newurl + end + return url +end + +wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason) + local url = urlpos["url"]["url"] + local parenturl = parent["url"] + local extract_page_requisites = false + + local current_settings_all = current_settings and current_settings["all"] + local current_settings_any_domain = current_settings and current_settings["any_domain"] + + --queue_monthly_url(string.match(url, "^(https?://[^/]+)") .. 
"/") + + if redirect_urls[parenturl] and not ( + status_code == 300 and string.match(parenturl, "^https?://[^/]*feb%-web%.ru/") + ) then + return true + end + + if find_path_loop(url, 2) then + return false + end + + local _, count = string.gsub(url, "[/%?]", "") + if count >= 16 then + return false + end + + for _, extension in pairs({ + "pdf", + "doc[mx]?", + "xls[mx]?", + "ppt[mx]?", + "zip", + "odt", + "odm", + "ods", + "odp", + "xml", + "json", + "torrent" + }) do + if string.match(parenturl, "%." .. extension .. "$") + or string.match(parenturl, "%." .. extension .. "[^a-z0-9A-Z]") + or string.match(parenturl, "%." .. string.upper(extension) .. "$") + or string.match(parenturl, "%." .. string.upper(extension) .. "[^a-z0-9A-Z]") then + return false + end + if string.match(url, "%." .. extension .. "$") + or string.match(url, "%." .. extension .. "[^a-z0-9A-Z]") + or string.match(url, "%." .. string.upper(extension) .. "$") + or string.match(url, "%." .. string.upper(extension) .. "[^a-z0-9A-Z]") then + queue_url(url) + return false + end + end + + local domain_match = checked_domains[item_first_url] + if not domain_match then + domain_match = check_domain_outlinks(item_first_url) + if not domain_match then + domain_match = "none" + end + checked_domains[item_first_url] = domain_match + end + if domain_match ~= "none" then + extract_page_requisites = true + local newurl_domain = string.match(url, "^https?://([^/]+)") + local to_queue = true + for domain, _ in pairs(redirect_domains) do + if check_domain_outlinks(url, domain) then + to_queue = false + break + end + end + if to_queue then + queue_url(url) + return false + end + end + + --[[if not extract_page_requisites then + return false + end]] + + if (status_code < 200 or status_code >= 300 or not verdict) + and not current_settings_all then + return false + end + + --[[if string.len(url) == string.len(parenturl) then + local good_url = false + local index1, index2 + temp_url = string.match(url, 
"^https?://(.+)$") + temp_parenturl = string.match(parenturl, "^https?://(.+)$") + local start_index = 1 + repeat + index1 = string.find(temp_url, "/", start_index) + index2 = string.find(temp_parenturl, "/", start_index) + if index1 ~= index2 then + good_url = true + break + end + if index1 then + start_index = index1 + 1 + end + until not index1 or not index2 + if not good_url then + return false + end + end]] + + if parenturl_uuid == nil then + parenturl_uuid = false + for old_parent_url, _ in pairs(visited_urls) do + for id_to_ignore, _ in pairs(ids_to_ignore) do + if string.match(old_parent_url, id_to_ignore) then + parenturl_uuid = true + break + end + end + if parenturl_uuid then + break + end + end + end + if parenturl_uuid then + for id_to_ignore, _ in pairs(ids_to_ignore) do + if string.match(url, id_to_ignore) and not current_settings_all then + return false + end + end + end + + if urlpos["link_refresh_p"] ~= 0 then + queue_url(url) + return false + end + + if parenturl_requisite == nil then + parenturl_requisite = false + for _, pattern in pairs(page_requisite_patterns) do + for old_parent_url, _ in pairs(visited_urls) do + if string.match(old_parent_url, pattern) then + parenturl_requisite = true + break + end + end + if parenturl_requisite then + break + end + end + end + if parenturl_requisite and not current_settings_all then + return false + end + + if urlpos["link_inline_p"] ~= 0 then + queue_url(url) + return false + end + + local current_host = string.match(urlpos["url"]["host"], "([^%.]+%.[^%.]+)$") + local first_parent_host = string.match(parent["host"], "([^%.]+%.[^%.]+)$") + + if current_url then + first_parent_host = string.match(current_url .. 
"/", "^https?://[^/]-([^/%.]+%.[^/%.]+)/") + end + + if current_settings_all and ( + current_settings_any_domain + or first_parent_host == current_host + ) then + queue_url(url, true) + return false + end + + --[[for old_parent_url, _ in pairs(visited_urls) do + for _, pattern in pairs(page_requisite_patterns) do + if string.match(old_parent_url, pattern) then + return false + end + end + end + + for _, pattern in pairs(page_requisite_patterns) do + if string.match(url, pattern) then + queue_url(url) + return false + end + end]] +end + +wget.callbacks.get_urls = function(file, url, is_css, iri) + local html = nil + + if url then + downloaded[url] = true + end + + local function check(url, headers) + local url = string.match(url, "^([^#]+)") + url = string.gsub(url, "&", "&") + queue_url(url) + end + + local function checknewurl(newurl, headers) + if string.match(newurl, "^#") then + return nil + end + if string.match(newurl, "\\[uU]002[fF]") then + return checknewurl(string.gsub(newurl, "\\[uU]002[fF]", "/"), headers) + end + if string.match(newurl, "^https?:////") then + check(string.gsub(newurl, ":////", "://"), headers) + elseif string.match(newurl, "^https?://") then + check(newurl, headers) + elseif string.match(newurl, "^https?:\\/\\?/") then + check(string.gsub(newurl, "\\", ""), headers) + elseif not url then + return nil + elseif string.match(newurl, "^\\/") then + checknewurl(string.gsub(newurl, "\\", ""), headers) + elseif string.match(newurl, "^//") then + check(urlparse.absolute(url, newurl), headers) + elseif string.match(newurl, "^/") then + check(urlparse.absolute(url, newurl), headers) + elseif string.match(newurl, "^%.%./") then + if string.match(url, "^https?://[^/]+/[^/]+/") then + check(urlparse.absolute(url, newurl), headers) + else + checknewurl(string.match(newurl, "^%.%.(/.+)$"), headers) + end + elseif string.match(newurl, "^%./") then + check(urlparse.absolute(url, newurl), headers) + end + end + + local function checknewshorturl(newurl, 
headers) + if string.match(newurl, "^#") then + return nil + end + if url and string.match(newurl, "^%?") then + check(urlparse.absolute(url, newurl), headers) + elseif url and not (string.match(newurl, "^https?:\\?/\\?//?/?") + or string.match(newurl, "^[/\\]") + or string.match(newurl, "^%./") + or string.match(newurl, "^[jJ]ava[sS]cript:") + or string.match(newurl, "^[mM]ail[tT]o:") + or string.match(newurl, "^vine:") + or string.match(newurl, "^android%-app:") + or string.match(newurl, "^ios%-app:") + or string.match(newurl, "^%${")) then + check(urlparse.absolute(url, newurl), headers) + else + checknewurl(newurl, headers) + end + end + + if (status_code == 200 and current_settings and current_settings["deep_extract"]) + or not url then + html = read_file(file) + if not url then + html = string.gsub(html, " ", " ") + html = string.gsub(html, "<", "<") + html = string.gsub(html, ">", ">") + html = string.gsub(html, """, '"') + html = string.gsub(html, "'", "'") + html = string.gsub(html, "&#(%d+);", + function(n) + return string.char(n) + end + ) + html = string.gsub(html, "&#x(%d+);", + function(n) + return string.char(tonumber(n, 16)) + end + ) + local temp_html = string.gsub(html, "\n", "") + for _, remove in pairs({"", "
", "]*>"}) do + if remove ~= "" then + temp_html = string.gsub(temp_html, remove, "") + end + for newurl in string.gmatch(temp_html, "(https?://[^%s<>#\"'\\`{})%]]+)") do + while string.match(newurl, "[%.&,!;]$") do + newurl = string.match(newurl, "^(.+).$") + end + check(newurl) + end + end + end + for newurl in string.gmatch(html, "[^%-][hH][rR][eE][fF]='([^']+)'") do + checknewshorturl(newurl) + end + for newurl in string.gmatch(html, '[^%-][hH][rR][eE][fF]="([^"]+)"') do + checknewshorturl(newurl) + end + for newurl in string.gmatch(string.gsub(html, "&[qQ][uU][oO][tT];", '"'), '"(https?://[^"]+)') do + checknewurl(newurl) + end + for newurl in string.gmatch(string.gsub(html, "'", "'"), "'(https?://[^']+)") do + checknewurl(newurl) + end + if url then + for newurl in string.gmatch(html, ">%s*([^<%s]+)") do + checknewurl(newurl) + end + end + --[[for newurl in string.gmatch(html, "%(([^%)]+)%)") do + checknewurl(newurl) + end]] + elseif string.match(url, "^https?://[^/]+/.*[^a-z0-9A-Z][pP][dD][fF]$") + or string.match(url, "^https?://[^/]+/.*[^a-z0-9A-Z][pP][dD][fF][^a-z0-9A-Z]") + or string.match(read_file(file, 4), "%%[pP][dD][fF]") then + io.stdout:write("Extracting links from PDF.\n") + io.stdout:flush() + local temp_file = file .. "-html.html" + local check_file = io.open(temp_file) + if check_file then + check_file:close() + os.remove(temp_file) + end + os.execute("pdftohtml -nodrm -hidden -i -s -q " .. file) + check_file = io.open(temp_file) + if check_file then + check_file:close() + local temp_length = table_length(queued_urls) + wget.callbacks.get_urls(temp_file, nil, nil, nil) + io.stdout:write("Found " .. tostring(table_length(queued_urls)-temp_length) .. 
" URLs.\n") + io.stdout:flush() + os.remove(temp_file) + else + io.stdout:write("Not a PDF.\n") + io.stdout:flush() + end + end +end + +wget.callbacks.write_to_warc = function(url, http_stat) + local url_lower = string.lower(url["url"]) + if urls[url_lower] then + current_url = url_lower + current_settings = urls_settings[url_lower] + end + if current_settings and not current_settings["random"] then + queue_url(url["url"]) + return false + end + if bad_code(http_stat["statcode"]) then + return false + elseif http_stat["statcode"] >= 300 and http_stat["statcode"] <= 399 then + local newloc = urlparse.absolute(url["url"], http_stat["newloc"]) + if string.match(newloc, "^https?://[^/]*google%.com/sorry") + or string.match(newloc, "^https?://[^/]*google%.com/[sS]ervice[lL]ogin") + or string.match(newloc, "^https?://consent%.youtube%.com/") + or string.match(newloc, "^https?://consent%.google%.com/") + or string.match(newloc, "^https?://misuse%.ncbi%.nlm%.nih%.gov/") + or string.match(newloc, "^https?://myprivacy%.dpgmedia%.nl/") + or string.match(newloc, "^https?://idp%.springer%.com/authorize%?") + or string.match(newloc, "^https?://[^/]*instagram%.com/accounts/") then + report_bad_url(url["url"]) + exit_url = true + return false + end + return true + elseif http_stat["statcode"] ~= 200 then + return true + end + if true then + return true + end + if http_stat["len"] > min_dedup_mb * 1024 * 1024 then + io.stdout:write("Data larger than " .. tostring(min_dedup_mb) .. " MB. Checking with Wayback Machine.\n") + io.stdout:flush() + while true do + local body, code, headers, status = http.request( + "https://web.archive.org/__wb/calendarcaptures/2" + .. "?url=" .. urlparse.escape(url["url"]) + .. "&date=202" + ) + if code ~= 200 then + io.stdout:write("Got " .. tostring(code) .. 
" from the Wayback Machine.\n") + io.stdout:flush() + os.execute("sleep 10") + else + data = JSON:decode(body) + if not data["items"] or not data["colls"] then + return true + end + for _, item in pairs(data["items"]) do + if item[2] == 200 then + local coll_id = item[3] + 1 + if not coll_id then + io.stdout:write("Could get coll ID.\n") + io.stdout:flush() + end + local collections = data["colls"][coll_id] + if not collections then + io.stdout:write("Could not get collections.\n") + io.stdout:flush() + end + for _, collection in pairs(collections) do + if collection == "archivebot" + or string.find(collection, "archiveteam") then + io.stdout:write("Archive Team got this URL before.\n") + return false + end + end + end + end + break + end + end + end + return true +end + +wget.callbacks.httploop_result = function(url, err, http_stat) + status_code = http_stat["statcode"] + + parenturl_uuid = nil + parenturl_requisite = nil + + local url_lower = string.lower(url["url"]) + if urls[url_lower] then + current_url = url_lower + current_settings = urls_settings[url_lower] + end + + if not timestamp then + local body, code, headers, status = http.request("https://legacy-api.arpa.li/now") + assert(code == 200) + timestamp = tonumber(string.match(body, "^([0-9]+)")) + end + + + if status_code ~= 0 then + local base_url = string.match(url["url"], "^(https://[^/]+)") + if base_url then + for _, newurl in pairs({ + base_url .. "/robots.txt", + base_url .. "/favicon.ico", + base_url .. "/" + }) do + queue_monthly_url(newurl) + end + end + end + + url_count = url_count + 1 + io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. 
" \n") + io.stdout:flush() + + if redirect_domains["done"] then + redirect_domains = {} + redirect_urls = {} + visited_urls = {} + item_first_url = nil + end + redirect_domains[string.match(url["url"], "^https?://([^/]+)")] = true + if not item_first_url then + item_first_url = url["url"] + end + + visited_urls[url["url"]] = true + + if exit_url then + exit_url = false + return wget.actions.EXIT + end + + if status_code >= 300 and status_code <= 399 then + local newloc = urlparse.absolute(url["url"], http_stat["newloc"]) + redirect_urls[url["url"]] = true + --[[if strip_url(url["url"]) == strip_url(newloc) then + queued_urls[newloc] = true + return wget.actions.EXIT + end]] + if downloaded[newloc] then + return wget.actions.EXIT + elseif string.match(url["url"], "^https?://[^/]*telegram%.org/dl%?tme=") + or ( + string.match(newloc, "^https?://www%.(.+)") + or string.match(newloc, "^https?://(.+)") + ) == ( + string.match(url["url"], "^https?://www%.(.+)") + or string.match(url["url"], "^https?://(.+)") + ) + or status_code == 301 + or status_code == 308 then + queue_url(newloc) + return wget.actions.EXIT + end + else + redirect_domains["done"] = true + end + + if downloaded[url["url"]] then + report_bad_url(url["url"]) + return wget.actions.EXIT + end + + for _, pattern in pairs(ignore_patterns) do + if string.match(url["url"], pattern) then + return wget.actions.EXIT + end + end + + if status_code >= 200 and status_code <= 399 then + downloaded[url["url"]] = true + end + + if status_code >= 200 and status_code < 300 then + queue_new_urls(url["url"]) + end + + if bad_code(status_code) then + io.stdout:write("Server returned " .. http_stat.statcode .. " (" .. err .. ").\n") + io.stdout:flush() + report_bad_url(url["url"]) + return wget.actions.EXIT + end + + local sleep_time = 0 + + if sleep_time > 0.001 then + os.execute("sleep " .. 
sleep_time) + end + + return wget.actions.NOTHING +end + +wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time) + local function submit_backfeed(newurls) + local tries = 0 + local maxtries = 4 + while tries < maxtries do + local body, code, headers, status = http.request( + "https://legacy-api.arpa.li/backfeed/legacy/urls-glx7ansh4e17aii", + newurls .. "\0" + ) + print(body) + if code == 200 then + io.stdout:write("Submitted discovered URLs.\n") + io.stdout:flush() + break + end + io.stdout:write("Failed to submit discovered URLs." .. tostring(code) .. tostring(body) .. "\n") + io.stdout:flush() + os.execute("sleep " .. math.floor(math.pow(2, tries))) + tries = tries + 1 + end + if tries == maxtries then + abortgrab = true + end + end + + local newurls = nil + local is_bad = false + local count = 0 + local dup_urls = io.open(item_dir .. "/" .. warc_file_base .. "_duplicate-urls.txt", "w") + for url, _ in pairs(queued_urls) do + for _, pattern in pairs(bad_patterns) do + is_bad = string.match(url, pattern) + if is_bad then + io.stdout:write("Filtering out URL " .. url .. ".\n") + io.stdout:flush() + break + end + end + if not is_bad then + io.stdout:write("Queuing URL " .. url .. ".\n") + io.stdout:flush() + dup_urls:write(url .. "\n") + if newurls == nil then + newurls = url + else + newurls = newurls .. "\0" .. url + end + count = count + 1 + if count == 100 then + submit_backfeed(newurls) + newurls = nil + count = 0 + end + end + end + if newurls ~= nil then + submit_backfeed(newurls) + end + dup_urls:close() + + local file = io.open(item_dir .. "/" .. warc_file_base .. "_bad-urls.txt", "w") + for url, _ in pairs(bad_urls) do + file:write(url .. 
"\n") + end + file:close() +end + +wget.callbacks.before_exit = function(exit_status, exit_status_string) + if abortgrab then + return wget.exits.IO_FAIL + end + return exit_status +end + diff --git a/user-agents.txt b/user-agents.txt new file mode 100644 index 0000000..b78e5a4 --- /dev/null +++ b/user-agents.txt @@ -0,0 +1,381 @@ +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:43.0) Gecko/20100101 Firefox/43.0 SeaMonkey/2.40 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:45.0) Gecko/20100101 Firefox/45.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:47.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:48.0) Gecko/20100101 Firefox/48.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:49.0) Gecko/20100101 Firefox/49.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:50.0) Gecko/20100101 Firefox/50.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:54.0) Gecko/20100101 Firefox/54.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:56.0) Gecko/20100101 Firefox/56.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:56.0) Gecko/20100101 Firefox/56.0.4 Waterfox/56.0.4 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.3 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.4 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.5 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:58.0) Gecko/20100101 Firefox/58.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:62.0) Gecko/20100101 Firefox/62.0 
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:65.0) Gecko/20100101 Firefox/65.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:43.0) Gecko/20100101 Firefox/43.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:43.0) Gecko/20100101 Firefox/43.0 SeaMonkey/2.40 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:48.0) Gecko/20100101 Firefox/48.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:49.0) Gecko/20100101 Firefox/49.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0 SeaMonkey/2.48 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:54.0) Gecko/20100101 Firefox/54.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:56.0) Gecko/20100101 Firefox/56.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:56.0) Gecko/20100101 Firefox/56.0.4 Waterfox/56.0.4 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.3 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.4 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.5 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:57.0) Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:58.0) Gecko/20100101 Firefox/58.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:60.0) 
Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:49.0) Gecko/20100101 Firefox/49.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:50.0) Gecko/20100101 Firefox/50.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:51.0) Gecko/20100101 Firefox/51.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.5 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:59.0.2) Gecko/20100101 Firefox/59.0.2 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:61.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 
(Macintosh; Intel Mac OS X 10.13; rv:40.0) Gecko/20100101 Firefox/40.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:48.0) Gecko/20100101 Firefox/48.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.3 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0.4 Waterfox/56.0.4 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.3 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.4 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.5 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/99.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:65.0) Gecko/20100101 Firefox/65.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.2 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.5 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:57.0) 
Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:40.0) Gecko/20100101 Firefox/40.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:45.0) Gecko/20100101 Firefox/45.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:47.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:48.0) Gecko/20100101 Firefox/48.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.8.3 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:44.0) Gecko/20100101 Firefox/44.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:45.0) Gecko/20100101 Firefox/45.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:47.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:48.0) Gecko/20100101 Firefox/48.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:49.0) Gecko/20100101 Firefox/49.0.2.1 Waterfox/49.0.2.1 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:45.0) Gecko/20100101 Firefox/45.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:48.0) Gecko/20100101 Firefox/48.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:56.0) Gecko/20100101 Firefox/56.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.5 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:41.0) Gecko/20100101 Firefox/41.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:47.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:48.0) Gecko/20100101 
Firefox/48.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.1 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:56.0) Gecko/20100101 Firefox/56.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:56.0) Gecko/20100101 Firefox/56.0.1 Waterfox/56.0.1 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:57.0) Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:58.0) Gecko/20100101 Firefox/58.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1; rv:50.0) Gecko/20100101 Firefox/50.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2; rv:49.0) Gecko/20100101 Firefox/49.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.102 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36 
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3639.1 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_29_81; rv:45.70.23) Gecko/20134284 Firefox/45.70.23 +Mozilla/5.0 (Macintosh; Intel Mac OS X 11.11; rv:51.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 9.3; rv:45.0) Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 9.3; rv:45.0) Gecko/20100101 Firefox/59.0.2 +Mozilla/5.0 (Macintosh; PPC Mac OS X 10.11; rv:46.0) Gecko/20100101 Firefox/46.0 +Mozilla/5.0 (Macintosh; PPC Mac OS X 10.12; rv:46.0) Gecko/20100101 Firefox/46.0 +Mozilla/5.0 (Macintosh; PPC Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR7; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/G5 +Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR8; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/G5 +Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR9; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/G5 +Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR8; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450 +Mozilla/5.0 (Macintosh; PPC Mac OS X 10.8; rv:47.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.10; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.10; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 
(Macintosh; U; Intel Mac OS X 10.10; rv:65.0) Gecko/20100101 Firefox/65.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.11; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.11; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.11; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.11; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.12; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.12; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.12; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 +Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.85 Safari/537.36 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:20.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/45.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 IceDragon/40.1.1.18 Firefox/40.0.2 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0 
Framafox/43.0.1 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0 SeaMonkey/2.40 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.63.16) Gecko/20175595 Firefox/45.63.16 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0 SeaMonkey/2.46 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/45.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.9.1 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.1 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.2 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.2 Lightning/5.4 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.3 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.4 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 Zotero/5.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Firefox/52.9 +Mozilla/5.0 
(Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.6.2 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.7.2 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.8.2 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.8.3 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.1 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.2 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.3 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 Basilisk/20180927 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 PaleMoon/28.0.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 PaleMoon/28.0.0a2 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 PaleMoon/28.0.1 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 PaleMoon/28.1.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/50.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0 SeaMonkey/2.49.3 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 
Firefox/58.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0 IceDragon/58.0.1 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0 IceDragon/60.0.2 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.9) Gecko/20100101 Goanna/4.1 Firefox/60.9 PaleMoon/28.2.1 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0 IceDragon/61.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0 IceDragon/62.0.2 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:65.0) Gecko/20100101 Firefox/65.0 +Mozilla/5.0 (Windows NT 10.0; Win64; rv:54.0) Gecko/20100101 Firefox/54.0 +Mozilla/5.0 (Windows NT 10.0; Win64; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Windows NT 10.0; Win64; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Windows NT 10.0; Win64; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Windows NT 10.0; Win64; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Windows NT 10.0; Win64; rv:61.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Windows NT 10.0; Win64; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Windows NT 10.0; Win64; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:41.0) Gecko/20100101 Firefox/41.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0.4 Waterfox/43.0.4 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:46.0) 
Gecko/20100101 Firefox/46.0.1 Waterfox/46.0.1 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:49.0) Gecko/20100101 Firefox/49.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:51.0) Gecko/20100101 Firefox/51.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.0.4 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.5.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.5.2 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.7.2 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.7.4 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.8.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.9.1 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0.2 Waterfox/52.0.2 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/3.3 Firefox/52.9 PaleMoon/27.5.1 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.8.3 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.1 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.2 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.3 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) 
Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.4 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 Basilisk/20180424 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 Basilisk/20180515 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 Basilisk/20180601 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 Basilisk/20180718 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 Basilisk/20180905 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 Basilisk/20180927 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 PaleMoon/28.0.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 PaleMoon/28.0.0.1 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 PaleMoon/28.0.1 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 PaleMoon/28.1.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0.1 Waterfox/54.0.1 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0.1 Waterfox/56.0.1 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0.4 Waterfox/56.0.4 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.3 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0; Waterfox) Gecko/20100101 Firefox/56.2.4 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0; Waterfox) 
Gecko/20100101 Firefox/56.2.5 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (Windows NT 10.0; rv:44.0) Gecko/20100101 Firefox/44.0.1 +Mozilla/5.0 (Windows NT 10.0; rv:45.0) Gecko/20100101 Firefox/45.0 +Mozilla/5.0 (Windows NT 10.0; rv:47.0) Gecko/20100101 Firefox/47.0 +Mozilla/5.0 (Windows NT 10.0; rv:49.0) Gecko/20100101 Firefox/49.0 +Mozilla/5.0 (Windows NT 10.0; rv:50.0) Gecko/20100101 Firefox/50.0 +Mozilla/5.0 (Windows NT 10.0; rv:51.0) Gecko/20100101 Firefox/51.0 +Mozilla/5.0 (Windows NT 10.0; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Windows NT 10.0; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.7.2 +Mozilla/5.0 (Windows NT 10.0; rv:52.0) Gecko/20100101 Firefox/52.0 Cyberfox/52.9.1 +Mozilla/5.0 (Windows NT 10.0; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.4 +Mozilla/5.0 (Windows NT 10.0; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.1 +Mozilla/5.0 (Windows NT 10.0; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.1a1 +Mozilla/5.0 (Windows NT 10.0; rv:52.9) Gecko/20100101 Goanna/3.4 Firefox/52.9 PaleMoon/27.9.3 +Mozilla/5.0 (Windows NT 10.0; rv:52.9) Gecko/20100101 Goanna/4.1 Firefox/52.9 PaleMoon/28.1.0 +Mozilla/5.0 (Windows NT 10.0; rv:53.0) Gecko/20100101 Firefox/53.0 +Mozilla/5.0 (Windows NT 10.0; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Windows NT 10.0; rv:56.0) Gecko/20100101 Firefox/56.0 +Mozilla/5.0 
(Windows NT 10.0; rv:57.0) Gecko/20100101 Firefox/57.0 +Mozilla/5.0 (Windows NT 10.0; rv:58.0) Gecko/20100101 Firefox/58.0 +Mozilla/5.0 (Windows NT 10.0; rv:59.0) Gecko/20100101 Firefox/59.0 +Mozilla/5.0 (Windows NT 10.0; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Windows NT 10.0; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Windows NT 10.0; rv:62.0) Gecko/20100101 Firefox/62.0 +Mozilla/5.0 (Windows NT 10.0; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (Windows NT 4.0; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Windows NT 5.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0 +Mozilla/5.0 (Windows NT 5.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Windows NT 5.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0 +Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Windows NT 6.1; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; 
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 +Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (Windows NT 6.1; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 +Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (X11; CrOS x86_64 11021.81.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, 
like Gecko) Chrome/71.0.3578.80 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.77 Chrome/70.0.3538.77 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0 +Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0 +Mozilla/5.0 (X11; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0 +Mozilla/5.0 (X11; OpenBSD amd64; rv:56.0) Gecko/20100101 Firefox/66.0 +Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0 +Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0