From 0bec576ea8b96d9c9bfedbf7be56ad045533f9ff Mon Sep 17 00:00:00 2001 From: TheTechRobo <52163910+TheTechRobo@users.noreply.github.com> Date: Sun, 1 May 2022 18:51:51 -0400 Subject: [PATCH] WIP --- JSON.lua | 1053 ------------------------------- Makefile | 7 + bad-params.txt | 64 -- bad-patterns.txt | 33 - boilerplate.py | 1 + extract-outlinks-patterns.txt | 1112 --------------------------------- gmd.lua | 122 ++++ grab.lua | 1 + ignore-patterns.txt | 21 - page-requisite-patterns.txt | 17 - pipeline.py | 360 +++-------- urls.lua | 942 ---------------------------- user-agents.txt | 381 ----------- 13 files changed, 215 insertions(+), 3899 deletions(-) delete mode 100644 JSON.lua create mode 100644 Makefile delete mode 100644 bad-params.txt delete mode 100644 bad-patterns.txt create mode 100644 boilerplate.py delete mode 100644 extract-outlinks-patterns.txt create mode 100644 gmd.lua create mode 100644 grab.lua delete mode 100644 ignore-patterns.txt delete mode 100644 page-requisite-patterns.txt delete mode 100644 urls.lua delete mode 100644 user-agents.txt diff --git a/JSON.lua b/JSON.lua deleted file mode 100644 index 5f11425..0000000 --- a/JSON.lua +++ /dev/null @@ -1,1053 +0,0 @@ --- -*- coding: utf-8 -*- --- --- Simple JSON encoding and decoding in pure Lua. --- --- Copyright 2010-2014 Jeffrey Friedl --- http://regex.info/blog/ --- --- Latest version: http://regex.info/blog/lua/json --- --- This code is released under a Creative Commons CC-BY "Attribution" License: --- http://creativecommons.org/licenses/by/3.0/deed.en_US --- --- It can be used for any purpose so long as the copyright notice above, --- the web-page links above, and the 'AUTHOR_NOTE' string below are --- maintained. Enjoy. --- -local VERSION = 20141223.14 -- version history at end of file -local AUTHOR_NOTE = "-[ JSON.lua package by Jeffrey Friedl (http://regex.info/blog/lua/json) version 20141223.14 ]-" - --- --- The 'AUTHOR_NOTE' variable exists so that information about the source --- of the package is maintained even in compiled versions. It's also --- included in OBJDEF below mostly to quiet warnings about unused variables. --- -local OBJDEF = { - VERSION = VERSION, - AUTHOR_NOTE = AUTHOR_NOTE, -} - - --- --- Simple JSON encoding and decoding in pure Lua. --- http://www.json.org/ --- --- --- JSON = assert(loadfile "JSON.lua")() -- one-time load of the routines --- --- local lua_value = JSON:decode(raw_json_text) --- --- local raw_json_text = JSON:encode(lua_table_or_value) --- local pretty_json_text = JSON:encode_pretty(lua_table_or_value) -- "pretty printed" version for human readability --- --- --- --- DECODING (from a JSON string to a Lua table) --- --- --- JSON = assert(loadfile "JSON.lua")() -- one-time load of the routines --- --- local lua_value = JSON:decode(raw_json_text) --- --- If the JSON text is for an object or an array, e.g. --- { "what": "books", "count": 3 } --- or --- [ "Larry", "Curly", "Moe" ] --- --- the result is a Lua table, e.g. --- { what = "books", count = 3 } --- or --- { "Larry", "Curly", "Moe" } --- --- --- The encode and decode routines accept an optional second argument, --- "etc", which is not used during encoding or decoding, but upon error --- is passed along to error handlers. It can be of any type (including nil). --- --- --- --- ERROR HANDLING --- --- With most errors during decoding, this code calls --- --- JSON:onDecodeError(message, text, location, etc) --- --- with a message about the error, and if known, the JSON text being --- parsed and the byte count where the problem was discovered. You can --- replace the default JSON:onDecodeError() with your own function. --- --- The default onDecodeError() merely augments the message with data --- about the text and the location if known (and if a second 'etc' --- argument had been provided to decode(), its value is tacked onto the --- message as well), and then calls JSON.assert(), which itself defaults --- to Lua's built-in assert(), and can also be overridden. --- --- For example, in an Adobe Lightroom plugin, you might use something like --- --- function JSON:onDecodeError(message, text, location, etc) --- LrErrors.throwUserError("Internal Error: invalid JSON data") --- end --- --- or even just --- --- function JSON.assert(message) --- LrErrors.throwUserError("Internal Error: " .. message) --- end --- --- If JSON:decode() is passed a nil, this is called instead: --- --- JSON:onDecodeOfNilError(message, nil, nil, etc) --- --- and if JSON:decode() is passed HTML instead of JSON, this is called: --- --- JSON:onDecodeOfHTMLError(message, text, nil, etc) --- --- The use of the fourth 'etc' argument allows stronger coordination --- between decoding and error reporting, especially when you provide your --- own error-handling routines. Continuing with the the Adobe Lightroom --- plugin example: --- --- function JSON:onDecodeError(message, text, location, etc) --- local note = "Internal Error: invalid JSON data" --- if type(etc) = 'table' and etc.photo then --- note = note .. " while processing for " .. etc.photo:getFormattedMetadata('fileName') --- end --- LrErrors.throwUserError(note) --- end --- --- : --- : --- --- for i, photo in ipairs(photosToProcess) do --- : --- : --- local data = JSON:decode(someJsonText, { photo = photo }) --- : --- : --- end --- --- --- --- --- --- DECODING AND STRICT TYPES --- --- Because both JSON objects and JSON arrays are converted to Lua tables, --- it's not normally possible to tell which original JSON type a --- particular Lua table was derived from, or guarantee decode-encode --- round-trip equivalency. --- --- However, if you enable strictTypes, e.g. --- --- JSON = assert(loadfile "JSON.lua")() --load the routines --- JSON.strictTypes = true --- --- then the Lua table resulting from the decoding of a JSON object or --- JSON array is marked via Lua metatable, so that when re-encoded with --- JSON:encode() it ends up as the appropriate JSON type. --- --- (This is not the default because other routines may not work well with --- tables that have a metatable set, for example, Lightroom API calls.) --- --- --- ENCODING (from a lua table to a JSON string) --- --- JSON = assert(loadfile "JSON.lua")() -- one-time load of the routines --- --- local raw_json_text = JSON:encode(lua_table_or_value) --- local pretty_json_text = JSON:encode_pretty(lua_table_or_value) -- "pretty printed" version for human readability --- local custom_pretty = JSON:encode(lua_table_or_value, etc, { pretty = true, indent = "| ", align_keys = false }) --- --- On error during encoding, this code calls: --- --- JSON:onEncodeError(message, etc) --- --- which you can override in your local JSON object. --- --- The 'etc' in the error call is the second argument to encode() --- and encode_pretty(), or nil if it wasn't provided. --- --- --- PRETTY-PRINTING --- --- An optional third argument, a table of options, allows a bit of --- configuration about how the encoding takes place: --- --- pretty = JSON:encode(val, etc, { --- pretty = true, -- if false, no other options matter --- indent = " ", -- this provides for a three-space indent per nesting level --- align_keys = false, -- see below --- }) --- --- encode() and encode_pretty() are identical except that encode_pretty() --- provides a default options table if none given in the call: --- --- { pretty = true, align_keys = false, indent = " " } --- --- For example, if --- --- JSON:encode(data) --- --- produces: --- --- {"city":"Kyoto","climate":{"avg_temp":16,"humidity":"high","snowfall":"minimal"},"country":"Japan","wards":11} --- --- then --- --- JSON:encode_pretty(data) --- --- produces: --- --- { --- "city": "Kyoto", --- "climate": { --- "avg_temp": 16, --- "humidity": "high", --- "snowfall": "minimal" --- }, --- "country": "Japan", --- "wards": 11 --- } --- --- The following three lines return identical results: --- JSON:encode_pretty(data) --- JSON:encode_pretty(data, nil, { pretty = true, align_keys = false, indent = " " }) --- JSON:encode (data, nil, { pretty = true, align_keys = false, indent = " " }) --- --- An example of setting your own indent string: --- --- JSON:encode_pretty(data, nil, { pretty = true, indent = "| " }) --- --- produces: --- --- { --- | "city": "Kyoto", --- | "climate": { --- | | "avg_temp": 16, --- | | "humidity": "high", --- | | "snowfall": "minimal" --- | }, --- | "country": "Japan", --- | "wards": 11 --- } --- --- An example of setting align_keys to true: --- --- JSON:encode_pretty(data, nil, { pretty = true, indent = " ", align_keys = true }) --- --- produces: --- --- { --- "city": "Kyoto", --- "climate": { --- "avg_temp": 16, --- "humidity": "high", --- "snowfall": "minimal" --- }, --- "country": "Japan", --- "wards": 11 --- } --- --- which I must admit is kinda ugly, sorry. This was the default for --- encode_pretty() prior to version 20141223.14. --- --- --- AMBIGUOUS SITUATIONS DURING THE ENCODING --- --- During the encode, if a Lua table being encoded contains both string --- and numeric keys, it fits neither JSON's idea of an object, nor its --- idea of an array. To get around this, when any string key exists (or --- when non-positive numeric keys exist), numeric keys are converted to --- strings. --- --- For example, --- JSON:encode({ "one", "two", "three", SOMESTRING = "some string" })) --- produces the JSON object --- {"1":"one","2":"two","3":"three","SOMESTRING":"some string"} --- --- To prohibit this conversion and instead make it an error condition, set --- JSON.noKeyConversion = true --- - - - - --- --- SUMMARY OF METHODS YOU CAN OVERRIDE IN YOUR LOCAL LUA JSON OBJECT --- --- assert --- onDecodeError --- onDecodeOfNilError --- onDecodeOfHTMLError --- onEncodeError --- --- If you want to create a separate Lua JSON object with its own error handlers, --- you can reload JSON.lua or use the :new() method. --- ---------------------------------------------------------------------------- - -local default_pretty_indent = " " -local default_pretty_options = { pretty = true, align_keys = false, indent = default_pretty_indent } - -local isArray = { __tostring = function() return "JSON array" end } isArray.__index = isArray -local isObject = { __tostring = function() return "JSON object" end } isObject.__index = isObject - - -function OBJDEF:newArray(tbl) - return setmetatable(tbl or {}, isArray) -end - -function OBJDEF:newObject(tbl) - return setmetatable(tbl or {}, isObject) -end - -local function unicode_codepoint_as_utf8(codepoint) - -- - -- codepoint is a number - -- - if codepoint <= 127 then - return string.char(codepoint) - - elseif codepoint <= 2047 then - -- - -- 110yyyxx 10xxxxxx <-- useful notation from http://en.wikipedia.org/wiki/Utf8 - -- - local highpart = math.floor(codepoint / 0x40) - local lowpart = codepoint - (0x40 * highpart) - return string.char(0xC0 + highpart, - 0x80 + lowpart) - - elseif codepoint <= 65535 then - -- - -- 1110yyyy 10yyyyxx 10xxxxxx - -- - local highpart = math.floor(codepoint / 0x1000) - local remainder = codepoint - 0x1000 * highpart - local midpart = math.floor(remainder / 0x40) - local lowpart = remainder - 0x40 * midpart - - highpart = 0xE0 + highpart - midpart = 0x80 + midpart - lowpart = 0x80 + lowpart - - -- - -- Check for an invalid character (thanks Andy R. at Adobe). - -- See table 3.7, page 93, in http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#G28070 - -- - if ( highpart == 0xE0 and midpart < 0xA0 ) or - ( highpart == 0xED and midpart > 0x9F ) or - ( highpart == 0xF0 and midpart < 0x90 ) or - ( highpart == 0xF4 and midpart > 0x8F ) - then - return "?" - else - return string.char(highpart, - midpart, - lowpart) - end - - else - -- - -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx - -- - local highpart = math.floor(codepoint / 0x40000) - local remainder = codepoint - 0x40000 * highpart - local midA = math.floor(remainder / 0x1000) - remainder = remainder - 0x1000 * midA - local midB = math.floor(remainder / 0x40) - local lowpart = remainder - 0x40 * midB - - return string.char(0xF0 + highpart, - 0x80 + midA, - 0x80 + midB, - 0x80 + lowpart) - end -end - -function OBJDEF:onDecodeError(message, text, location, etc) - if text then - if location then - message = string.format("%s at char %d of: %s", message, location, text) - else - message = string.format("%s: %s", message, text) - end - end - - if etc ~= nil then - message = message .. " (" .. OBJDEF:encode(etc) .. ")" - end - - if self.assert then - self.assert(false, message) - else - assert(false, message) - end -end - -OBJDEF.onDecodeOfNilError = OBJDEF.onDecodeError -OBJDEF.onDecodeOfHTMLError = OBJDEF.onDecodeError - -function OBJDEF:onEncodeError(message, etc) - if etc ~= nil then - message = message .. " (" .. OBJDEF:encode(etc) .. ")" - end - - if self.assert then - self.assert(false, message) - else - assert(false, message) - end -end - -local function grok_number(self, text, start, etc) - -- - -- Grab the integer part - -- - local integer_part = text:match('^-?[1-9]%d*', start) - or text:match("^-?0", start) - - if not integer_part then - self:onDecodeError("expected number", text, start, etc) - end - - local i = start + integer_part:len() - - -- - -- Grab an optional decimal part - -- - local decimal_part = text:match('^%.%d+', i) or "" - - i = i + decimal_part:len() - - -- - -- Grab an optional exponential part - -- - local exponent_part = text:match('^[eE][-+]?%d+', i) or "" - - i = i + exponent_part:len() - - local full_number_text = integer_part .. decimal_part .. exponent_part - local as_number = tonumber(full_number_text) - - if not as_number then - self:onDecodeError("bad number", text, start, etc) - end - - return as_number, i -end - - -local function grok_string(self, text, start, etc) - - if text:sub(start,start) ~= '"' then - self:onDecodeError("expected string's opening quote", text, start, etc) - end - - local i = start + 1 -- +1 to bypass the initial quote - local text_len = text:len() - local VALUE = "" - while i <= text_len do - local c = text:sub(i,i) - if c == '"' then - return VALUE, i + 1 - end - if c ~= '\\' then - VALUE = VALUE .. c - i = i + 1 - elseif text:match('^\\b', i) then - VALUE = VALUE .. "\b" - i = i + 2 - elseif text:match('^\\f', i) then - VALUE = VALUE .. "\f" - i = i + 2 - elseif text:match('^\\n', i) then - VALUE = VALUE .. "\n" - i = i + 2 - elseif text:match('^\\r', i) then - VALUE = VALUE .. "\r" - i = i + 2 - elseif text:match('^\\t', i) then - VALUE = VALUE .. "\t" - i = i + 2 - else - local hex = text:match('^\\u([0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF])', i) - if hex then - i = i + 6 -- bypass what we just read - - -- We have a Unicode codepoint. It could be standalone, or if in the proper range and - -- followed by another in a specific range, it'll be a two-code surrogate pair. - local codepoint = tonumber(hex, 16) - if codepoint >= 0xD800 and codepoint <= 0xDBFF then - -- it's a hi surrogate... see whether we have a following low - local lo_surrogate = text:match('^\\u([dD][cdefCDEF][0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF])', i) - if lo_surrogate then - i = i + 6 -- bypass the low surrogate we just read - codepoint = 0x2400 + (codepoint - 0xD800) * 0x400 + tonumber(lo_surrogate, 16) - else - -- not a proper low, so we'll just leave the first codepoint as is and spit it out. - end - end - VALUE = VALUE .. unicode_codepoint_as_utf8(codepoint) - - else - - -- just pass through what's escaped - VALUE = VALUE .. text:match('^\\(.)', i) - i = i + 2 - end - end - end - - self:onDecodeError("unclosed string", text, start, etc) -end - -local function skip_whitespace(text, start) - - local _, match_end = text:find("^[ \n\r\t]+", start) -- [http://www.ietf.org/rfc/rfc4627.txt] Section 2 - if match_end then - return match_end + 1 - else - return start - end -end - -local grok_one -- assigned later - -local function grok_object(self, text, start, etc) - if text:sub(start,start) ~= '{' then - self:onDecodeError("expected '{'", text, start, etc) - end - - local i = skip_whitespace(text, start + 1) -- +1 to skip the '{' - - local VALUE = self.strictTypes and self:newObject { } or { } - - if text:sub(i,i) == '}' then - return VALUE, i + 1 - end - local text_len = text:len() - while i <= text_len do - local key, new_i = grok_string(self, text, i, etc) - - i = skip_whitespace(text, new_i) - - if text:sub(i, i) ~= ':' then - self:onDecodeError("expected colon", text, i, etc) - end - - i = skip_whitespace(text, i + 1) - - local new_val, new_i = grok_one(self, text, i) - - VALUE[key] = new_val - - -- - -- Expect now either '}' to end things, or a ',' to allow us to continue. - -- - i = skip_whitespace(text, new_i) - - local c = text:sub(i,i) - - if c == '}' then - return VALUE, i + 1 - end - - if text:sub(i, i) ~= ',' then - self:onDecodeError("expected comma or '}'", text, i, etc) - end - - i = skip_whitespace(text, i + 1) - end - - self:onDecodeError("unclosed '{'", text, start, etc) -end - -local function grok_array(self, text, start, etc) - if text:sub(start,start) ~= '[' then - self:onDecodeError("expected '['", text, start, etc) - end - - local i = skip_whitespace(text, start + 1) -- +1 to skip the '[' - local VALUE = self.strictTypes and self:newArray { } or { } - if text:sub(i,i) == ']' then - return VALUE, i + 1 - end - - local VALUE_INDEX = 1 - - local text_len = text:len() - while i <= text_len do - local val, new_i = grok_one(self, text, i) - - -- can't table.insert(VALUE, val) here because it's a no-op if val is nil - VALUE[VALUE_INDEX] = val - VALUE_INDEX = VALUE_INDEX + 1 - - i = skip_whitespace(text, new_i) - - -- - -- Expect now either ']' to end things, or a ',' to allow us to continue. - -- - local c = text:sub(i,i) - if c == ']' then - return VALUE, i + 1 - end - if text:sub(i, i) ~= ',' then - self:onDecodeError("expected comma or '['", text, i, etc) - end - i = skip_whitespace(text, i + 1) - end - self:onDecodeError("unclosed '['", text, start, etc) -end - - -grok_one = function(self, text, start, etc) - -- Skip any whitespace - start = skip_whitespace(text, start) - - if start > text:len() then - self:onDecodeError("unexpected end of string", text, nil, etc) - end - - if text:find('^"', start) then - return grok_string(self, text, start, etc) - - elseif text:find('^[-0123456789 ]', start) then - return grok_number(self, text, start, etc) - - elseif text:find('^%{', start) then - return grok_object(self, text, start, etc) - - elseif text:find('^%[', start) then - return grok_array(self, text, start, etc) - - elseif text:find('^true', start) then - return true, start + 4 - - elseif text:find('^false', start) then - return false, start + 5 - - elseif text:find('^null', start) then - return nil, start + 4 - - else - self:onDecodeError("can't parse JSON", text, start, etc) - end -end - -function OBJDEF:decode(text, etc) - if type(self) ~= 'table' or self.__index ~= OBJDEF then - OBJDEF:onDecodeError("JSON:decode must be called in method format", nil, nil, etc) - end - - if text == nil then - self:onDecodeOfNilError(string.format("nil passed to JSON:decode()"), nil, nil, etc) - elseif type(text) ~= 'string' then - self:onDecodeError(string.format("expected string argument to JSON:decode(), got %s", type(text)), nil, nil, etc) - end - - if text:match('^%s*$') then - return nil - end - - if text:match('^%s*<') then - -- Can't be JSON... we'll assume it's HTML - self:onDecodeOfHTMLError(string.format("html passed to JSON:decode()"), text, nil, etc) - end - - -- - -- Ensure that it's not UTF-32 or UTF-16. - -- Those are perfectly valid encodings for JSON (as per RFC 4627 section 3), - -- but this package can't handle them. - -- - if text:sub(1,1):byte() == 0 or (text:len() >= 2 and text:sub(2,2):byte() == 0) then - self:onDecodeError("JSON package groks only UTF-8, sorry", text, nil, etc) - end - - local success, value = pcall(grok_one, self, text, 1, etc) - - if success then - return value - else - -- if JSON:onDecodeError() didn't abort out of the pcall, we'll have received the error message here as "value", so pass it along as an assert. - if self.assert then - self.assert(false, value) - else - assert(false, value) - end - -- and if we're still here, return a nil and throw the error message on as a second arg - return nil, value - end -end - -local function backslash_replacement_function(c) - if c == "\n" then - return "\\n" - elseif c == "\r" then - return "\\r" - elseif c == "\t" then - return "\\t" - elseif c == "\b" then - return "\\b" - elseif c == "\f" then - return "\\f" - elseif c == '"' then - return '\\"' - elseif c == '\\' then - return '\\\\' - else - return string.format("\\u%04x", c:byte()) - end -end - -local chars_to_be_escaped_in_JSON_string - = '[' - .. '"' -- class sub-pattern to match a double quote - .. '%\\' -- class sub-pattern to match a backslash - .. '%z' -- class sub-pattern to match a null - .. '\001' .. '-' .. '\031' -- class sub-pattern to match control characters - .. ']' - -local function json_string_literal(value) - local newval = value:gsub(chars_to_be_escaped_in_JSON_string, backslash_replacement_function) - return '"' .. newval .. '"' -end - -local function object_or_array(self, T, etc) - -- - -- We need to inspect all the keys... if there are any strings, we'll convert to a JSON - -- object. If there are only numbers, it's a JSON array. - -- - -- If we'll be converting to a JSON object, we'll want to sort the keys so that the - -- end result is deterministic. - -- - local string_keys = { } - local number_keys = { } - local number_keys_must_be_strings = false - local maximum_number_key - - for key in pairs(T) do - if type(key) == 'string' then - table.insert(string_keys, key) - elseif type(key) == 'number' then - table.insert(number_keys, key) - if key <= 0 or key >= math.huge then - number_keys_must_be_strings = true - elseif not maximum_number_key or key > maximum_number_key then - maximum_number_key = key - end - else - self:onEncodeError("can't encode table with a key of type " .. type(key), etc) - end - end - - if #string_keys == 0 and not number_keys_must_be_strings then - -- - -- An empty table, or a numeric-only array - -- - if #number_keys > 0 then - return nil, maximum_number_key -- an array - elseif tostring(T) == "JSON array" then - return nil - elseif tostring(T) == "JSON object" then - return { } - else - -- have to guess, so we'll pick array, since empty arrays are likely more common than empty objects - return nil - end - end - - table.sort(string_keys) - - local map - if #number_keys > 0 then - -- - -- If we're here then we have either mixed string/number keys, or numbers inappropriate for a JSON array - -- It's not ideal, but we'll turn the numbers into strings so that we can at least create a JSON object. - -- - - if self.noKeyConversion then - self:onEncodeError("a table with both numeric and string keys could be an object or array; aborting", etc) - end - - -- - -- Have to make a shallow copy of the source table so we can remap the numeric keys to be strings - -- - map = { } - for key, val in pairs(T) do - map[key] = val - end - - table.sort(number_keys) - - -- - -- Throw numeric keys in there as strings - -- - for _, number_key in ipairs(number_keys) do - local string_key = tostring(number_key) - if map[string_key] == nil then - table.insert(string_keys , string_key) - map[string_key] = T[number_key] - else - self:onEncodeError("conflict converting table with mixed-type keys into a JSON object: key " .. number_key .. " exists both as a string and a number.", etc) - end - end - end - - return string_keys, nil, map -end - --- --- Encode --- --- 'options' is nil, or a table with possible keys: --- pretty -- if true, return a pretty-printed version --- indent -- a string (usually of spaces) used to indent each nested level --- align_keys -- if true, align all the keys when formatting a table --- -local encode_value -- must predeclare because it calls itself -function encode_value(self, value, parents, etc, options, indent) - - if value == nil then - return 'null' - - elseif type(value) == 'string' then - return json_string_literal(value) - - elseif type(value) == 'number' then - if value ~= value then - -- - -- NaN (Not a Number). - -- JSON has no NaN, so we have to fudge the best we can. This should really be a package option. - -- - return "null" - elseif value >= math.huge then - -- - -- Positive infinity. JSON has no INF, so we have to fudge the best we can. This should - -- really be a package option. Note: at least with some implementations, positive infinity - -- is both ">= math.huge" and "<= -math.huge", which makes no sense but that's how it is. - -- Negative infinity is properly "<= -math.huge". So, we must be sure to check the ">=" - -- case first. - -- - return "1e+9999" - elseif value <= -math.huge then - -- - -- Negative infinity. - -- JSON has no INF, so we have to fudge the best we can. This should really be a package option. - -- - return "-1e+9999" - else - return tostring(value) - end - - elseif type(value) == 'boolean' then - return tostring(value) - - elseif type(value) ~= 'table' then - self:onEncodeError("can't convert " .. type(value) .. " to JSON", etc) - - else - -- - -- A table to be converted to either a JSON object or array. - -- - local T = value - - if type(options) ~= 'table' then - options = {} - end - if type(indent) ~= 'string' then - indent = "" - end - - if parents[T] then - self:onEncodeError("table " .. tostring(T) .. " is a child of itself", etc) - else - parents[T] = true - end - - local result_value - - local object_keys, maximum_number_key, map = object_or_array(self, T, etc) - if maximum_number_key then - -- - -- An array... - -- - local ITEMS = { } - for i = 1, maximum_number_key do - table.insert(ITEMS, encode_value(self, T[i], parents, etc, options, indent)) - end - - if options.pretty then - result_value = "[ " .. table.concat(ITEMS, ", ") .. " ]" - else - result_value = "[" .. table.concat(ITEMS, ",") .. "]" - end - - elseif object_keys then - -- - -- An object - -- - local TT = map or T - - if options.pretty then - - local KEYS = { } - local max_key_length = 0 - for _, key in ipairs(object_keys) do - local encoded = encode_value(self, tostring(key), parents, etc, options, indent) - if options.align_keys then - max_key_length = math.max(max_key_length, #encoded) - end - table.insert(KEYS, encoded) - end - local key_indent = indent .. tostring(options.indent or "") - local subtable_indent = key_indent .. string.rep(" ", max_key_length) .. (options.align_keys and " " or "") - local FORMAT = "%s%" .. string.format("%d", max_key_length) .. "s: %s" - - local COMBINED_PARTS = { } - for i, key in ipairs(object_keys) do - local encoded_val = encode_value(self, TT[key], parents, etc, options, subtable_indent) - table.insert(COMBINED_PARTS, string.format(FORMAT, key_indent, KEYS[i], encoded_val)) - end - result_value = "{\n" .. table.concat(COMBINED_PARTS, ",\n") .. "\n" .. indent .. "}" - - else - - local PARTS = { } - for _, key in ipairs(object_keys) do - local encoded_val = encode_value(self, TT[key], parents, etc, options, indent) - local encoded_key = encode_value(self, tostring(key), parents, etc, options, indent) - table.insert(PARTS, string.format("%s:%s", encoded_key, encoded_val)) - end - result_value = "{" .. table.concat(PARTS, ",") .. "}" - - end - else - -- - -- An empty array/object... we'll treat it as an array, though it should really be an option - -- - result_value = "[]" - end - - parents[T] = false - return result_value - end -end - - -function OBJDEF:encode(value, etc, options) - if type(self) ~= 'table' or self.__index ~= OBJDEF then - OBJDEF:onEncodeError("JSON:encode must be called in method format", etc) - end - return encode_value(self, value, {}, etc, options or nil) -end - -function OBJDEF:encode_pretty(value, etc, options) - if type(self) ~= 'table' or self.__index ~= OBJDEF then - OBJDEF:onEncodeError("JSON:encode_pretty must be called in method format", etc) - end - return encode_value(self, value, {}, etc, options or default_pretty_options) -end - -function OBJDEF.__tostring() - return "JSON encode/decode package" -end - -OBJDEF.__index = OBJDEF - -function OBJDEF:new(args) - local new = { } - - if args then - for key, val in pairs(args) do - new[key] = val - end - end - - return setmetatable(new, OBJDEF) -end - -return OBJDEF:new() - --- --- Version history: --- --- 20141223.14 The encode_pretty() routine produced fine results for small datasets, but isn't really --- appropriate for anything large, so with help from Alex Aulbach I've made the encode routines --- more flexible, and changed the default encode_pretty() to be more generally useful. --- --- Added a third 'options' argument to the encode() and encode_pretty() routines, to control --- how the encoding takes place. --- --- Updated docs to add assert() call to the loadfile() line, just as good practice so that --- if there is a problem loading JSON.lua, the appropriate error message will percolate up. --- --- 20140920.13 Put back (in a way that doesn't cause warnings about unused variables) the author string, --- so that the source of the package, and its version number, are visible in compiled copies. --- --- 20140911.12 Minor lua cleanup. --- Fixed internal reference to 'JSON.noKeyConversion' to reference 'self' instead of 'JSON'. --- (Thanks to SmugMug's David Parry for these.) --- --- 20140418.11 JSON nulls embedded within an array were being ignored, such that --- ["1",null,null,null,null,null,"seven"], --- would return --- {1,"seven"} --- It's now fixed to properly return --- {1, nil, nil, nil, nil, nil, "seven"} --- Thanks to "haddock" for catching the error. --- --- 20140116.10 The user's JSON.assert() wasn't always being used. Thanks to "blue" for the heads up. --- --- 20131118.9 Update for Lua 5.3... it seems that tostring(2/1) produces "2.0" instead of "2", --- and this caused some problems. --- --- 20131031.8 Unified the code for encode() and encode_pretty(); they had been stupidly separate, --- and had of course diverged (encode_pretty didn't get the fixes that encode got, so --- sometimes produced incorrect results; thanks to Mattie for the heads up). --- --- Handle encoding tables with non-positive numeric keys (unlikely, but possible). --- --- If a table has both numeric and string keys, or its numeric keys are inappropriate --- (such as being non-positive or infinite), the numeric keys are turned into --- string keys appropriate for a JSON object. So, as before, --- JSON:encode({ "one", "two", "three" }) --- produces the array --- ["one","two","three"] --- but now something with mixed key types like --- JSON:encode({ "one", "two", "three", SOMESTRING = "some string" })) --- instead of throwing an error produces an object: --- {"1":"one","2":"two","3":"three","SOMESTRING":"some string"} --- --- To maintain the prior throw-an-error semantics, set --- JSON.noKeyConversion = true --- --- 20131004.7 Release under a Creative Commons CC-BY license, which I should have done from day one, sorry. --- --- 20130120.6 Comment update: added a link to the specific page on my blog where this code can --- be found, so that folks who come across the code outside of my blog can find updates --- more easily. --- --- 20111207.5 Added support for the 'etc' arguments, for better error reporting. --- --- 20110731.4 More feedback from David Kolf on how to make the tests for Nan/Infinity system independent. --- --- 20110730.3 Incorporated feedback from David Kolf at http://lua-users.org/wiki/JsonModules: --- --- * When encoding lua for JSON, Sparse numeric arrays are now handled by --- spitting out full arrays, such that --- JSON:encode({"one", "two", [10] = "ten"}) --- returns --- ["one","two",null,null,null,null,null,null,null,"ten"] --- --- In 20100810.2 and earlier, only up to the first non-null value would have been retained. --- --- * When encoding lua for JSON, numeric value NaN gets spit out as null, and infinity as "1+e9999". --- Version 20100810.2 and earlier created invalid JSON in both cases. --- --- * Unicode surrogate pairs are now detected when decoding JSON. --- --- 20100810.2 added some checking to ensure that an invalid Unicode character couldn't leak in to the UTF-8 encoding --- --- 20100731.1 initial public release --- diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..800baae --- /dev/null +++ b/Makefile @@ -0,0 +1,7 @@ +run: + make clean + docker build -t img . + docker run --rm img test + +clean: + rm -rf img diff --git a/bad-params.txt b/bad-params.txt deleted file mode 100644 index 24dcbb0..0000000 --- a/bad-params.txt +++ /dev/null @@ -1,64 +0,0 @@ -utm_source -utm_medium -utm_campaign -utm_term -utm_content -utm_adgroup -ref -refsrc -referrer_id -referrerid -src -i -s -ts -feature -jsessionid -phpsessid -aspsessionid -sessionid -zenid -sid -gclid -fb_xd_fragment -fb_comment_id -fbclid -cfid -cftoken -doing_wp_cron -pk_cpn -pk_campaign -pk_kwd -pk_keyword -piwik_campaign -piwik_kwd -ga_source -ga_medium -ga_term -ga_content -ga_campaign -ga_place -yclid -_openstat -fb_action_ids -fb_action_types -fb_source -fb_ref -action_object_map -action_type_map -action_ref_map -gs_l -mkt_tok -hmb_campaign -hmb_medium -hmb_source -rand -wicket:antiCache -cachebuster -nocache -vs -dilid -script_case_session -cid -extid -_flowexecutionkey diff --git a/bad-patterns.txt b/bad-patterns.txt deleted file mode 100644 index 9427158..0000000 --- a/bad-patterns.txt +++ /dev/null @@ -1,33 +0,0 @@ -/action/consumeSharedSessionAction -/action/consumeSsoCookie -/action/getSharedSiteSession -/juris/error%.jsf -facebook%.com/login%.php -facebook%.com/cookie/ -facebook%.com/plugins/ -facebook%.com/sharer/ -facebook%.com/sharer%.php -gongquiz%.com.+&historyNo=[0-9]+ -univis%.univie%.ac%.at/ausschreibungstellensuche/ -fundraise%.cancerresearchuk%.org/signup/account/ -mma%.ft%.com -^https?://dmg%.go%-2b%-planer%.de/ -^https?://3d%.espace%-aubade%.fr/ -^https?://kuechenplaner%.[^/]+/cloud/ -^https?://3d%-salledebains%.geberit%.fr/ -^https?://bibliotekanauki%.ceon%.pl/yadda/search/general%.action -^https?://[^/]+%.icm%.edu%.pl/.*search/article%.action -^https?://interamt%.de/koop/app/ -^https?://tesiunam%.dgb%.unam%.mx/F/ -^https?://[^%.]+%.sedelectronica%.es/.*%?x= -^https?://www%.cp%-cc%.org/programs%-services/ -/ibank/_crypt_ -%%7B%%7B.+%%7D%%7D -^https?://[^/]+/" -^http://[0-9a-z][0-9a-z][0-9a-z][0-9][0-9][0-9]?%.[^%./]+%.com/$ -^http://[0-9a-z][0-9a-z][0-9a-z][0-9][0-9][0-9]?%.[^%./]+%.com/[a-z]+%.?[a-z][a-z][a-z]?$ -^http://[0-9a-z][0-9a-z][0-9a-z][0-9][0-9][0-9]?%.[^%./]+%.com/[a-z]+/[a-z]+[0-9]*%.?[a-z][a-z][a-z]?$ -^https?://[^/]*yahoo%.com/.+%%5C.+at%.atwola%.com -^https?://[^/]*at%.atwola%.com/ -^https?://www%.bafa%.de/ -%%5C%%22 diff --git a/boilerplate.py b/boilerplate.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/boilerplate.py @@ -0,0 +1 @@ + diff --git a/extract-outlinks-patterns.txt b/extract-outlinks-patterns.txt deleted file mode 100644 index bfa4645..0000000 --- a/extract-outlinks-patterns.txt +++ /dev/null @@ -1,1112 +0,0 @@ -15min.org -15minut.org -1prime.biz -24-ore.com -24.ae -7days.ae -8am.af -aamulehti.fi -abc.news.go.com -abstvradio.com -accringtonobserver.co.uk -acn.cu -ad.nl -adelante.cu -adigea.aif.ru -adsports.ae -adventure.nationalgeographic.com -af.farsnews.com -af.reuters.com -afghan-review.com -afghanews.ir -afghanislamicpress.com -afghanistannewscenter.com -afghanistansun.com -afghanistantimes.af -afghanpaper.com -aftenposten.no -aftonbladet.se -agora.co.ao -ahora.cu -ahram.org.eg -aif.ru -aiis-albania.org -airdrieecho.com -akhbarelyoum.dz -akhersaa-dz.com -al-fadjr.com -al-monitor.com -alalam.ir -alarabiya.net -albawabaeg.com -albayan.ae -albertafarmexpress.ca -alfajr-news.net -alger-info.com -algerie-focus.com -algerieconfluences.com -alhadath.net -alittihad.ae -aljazeera.com -alkhaleej.ae -alkmaarnieuws.nl -almasdarnews.com -almogaz.com -alquds.co.uk -alroeya.ae -alrugby.com -alseyassi-dz.com -alwahdanews.ae -alwatannewspaper.ae -am.radiovaticana.va -ambito.com -ameinfo.com -ana.ad -andorradifusio.ad -angliya.com -angonoticias.com -annasrdz.com -annasronline.com -antiguaobserver.com -aopnews.com -ap22.ru -apple.com -appledaily.com.tw -appleinsider.com -appleinsider.net -appleinsider.ru -aps.dz -ar.farsnews.com -ar.radiovaticana.va -ar.reuters.com -ar.timesofisrael.com -ara.ad -ara.reuters.com -arabianbusiness.com -arabic.sport360.com -ariananet.com -arnhemnieuws.nl -arstechnica.com -asia.nikkei.com -aspi.org.au -aspistrategist.org.au -astro.fashion.qq.com -aswatmasriya.com -atlantic.ctvnews.ca -ausairpower.net -auto.qq.com -autonews.ru -b.dk -baby.qq.com -bakhtarnews.com.af -balkanweb.com -banglaexpress.ae -barkinganddagenhampost.co.uk -barrie.ctvnews.ca -barrietoday.com -baytoday.ca -bbc.co.uk -bbc.com -bc.ctvnews.ca -be.radiovaticana.va -becclesandbungayjournal.co.uk -belegger.nl -benawa.com -bencarson.com -berlinergazette.de -berniesanders.com -bet.nl -beuningennieuws.nl -bexleytimes.co.uk -bg.radiovaticana.va -bgr.com -bigstory.ap.org -birminghammail.co.uk -birminghampost.co.uk -biz.tpo.nl -blackcountrybugle.co.uk -blikopnieuws.nl -blog.archive.org -blog.cleveland.com -blog.lesoir.be -blogs.canoe.com -blogs.wsj.com -blogues.canoe.ca -bloomberg.co.jp -bloomberg.com -bna.bh -bnn.ca -bnr.nl -boingboing.net -bondebladet.no -bondia.ad -book.qq.com -boxmeernieuws.nl -br.radiovaticana.va -br.reuters.com -br.wsj.com -brantfordexpositor.ca -bredajournaal.nl -breitbart.com -brisinst.org.au -bromleytimes.co.uk -bt.dk -btp-dz.com -buenosairesherald.com -burymercury.co.uk -buzzfeed.com.au -ca.reuters.com -calgary.ctvnews.ca -calgaryherald.com -calgarysun.com -cambstimes.co.uk -cameroonpostline.com -camrosecanadian.com -cankaoxiaoxi.com -capitalnewyork.com -catholicherald.co.uk -ceda.com.au -cesd.az -channel4.com -channelnewsasia.com -chathamthisweek.com -chealth.canoe.com -chesterchronicle.co.uk -china.kyodonews.jp -chinatimes.com -chosonsinbo.com -chosun.com -chrischristie.com -chron.com -chroniclelive.co.uk -chroom.tpo.nl -cis.org.au -citynews.ca -clarin.com -class.qq.com -cleveland.com -clintonnewsrecord.com -cn.ibtimes.com -cn.nytimes.com -cn.timesofisrael.com -cn.wsj.com -cnbc.com -cnea.gov.ar -cnet.com -cnews.ru -coastalscene24.co.uk -cochranetimes.com -cochranetimespost.ca -collections.unu.edu -competition.dz -computersweden.idg.se -conae.gov.ar -conicet.gov.ar -contenidos.lanacion.com.ar -coventryobserver.co.uk -coventrytelegraph.net -cp24.com -cpd.org.au -cphpost.dk -cranbrookherald.com -crewechronicle.co.uk -cronica.com.ar -cronicamendoza.com -cs.radiovaticana.va -ctvnews.ca -cubainfo.acn.cu -cubanews.acn.cu -cubasi.com -cubasi.cu -cuijknieuws.nl -cul.qq.com -cult.tpo.nl -cultofandroid.com -cultofandroid.com.feedsportal.com -cultofmac.com -cultofmac.com.feedsportal.com -cultofmac.com.ua -cybersecuritydojo.com -czechcrunch.cz -dagelijksestandaard.nl -dagen.no -dagen.se -dagens.dk -dagogtid.no -dagsavisen.no -daily-mail.co.zm -dailyafghanistan.com -dailyfinance.com -dailyheraldtribune.com -dailymail.co.uk -dailynewsegypt.com -dailynk.com -dailypost.co.uk -dailyrecord.co.uk -dailystar.co.uk -dailystar.com.lb -dajia.qq.com -dari.wadsam.com -data.gdeltproject.org -de.radiovaticana.va -de.reuters.com -delfi.lt -demokraatti.fi -demorgen.be -denboschnieuws.nl -depechedekabylie.com -derehamtimes.co.uk -destructoid.com -deutschlandradio.de -deventerjournaal.nl -devpolicy.crawford.anu.edu.au -devpolicy.org -di.se -diariandorra.ad -diariobae.com -diariopopular.com.ar -diarioshow.com -digi.tech.qq.com -dissmercury.co.uk -dn.no -dn.se -dnaindia.com -dnd.nl -docsalud.com -donaldjtrump.com -donbalon.com -donbalon.eu -dp.ru -dprktoday.com -dr.dk -draytonvalleywesternreview.com -dubaichronicle.com -dunmowbroadcast.co.uk -dutchdailynews.com -dutchinamerica.com -dutchnews.nl -dw.com -eaber.org -eadt.co.uk -eastasiaforum.org -eastlondonadvertiser.co.uk -ech-chaab.com -echoroukonline.com -economictimes.indiatimes.com -edition.cnn.com -edmonton.ctvnews.ca -edmontonjournal.com -edmontonsun.com -edsonleader.com -edu.qq.com -eg-online.ru -eindhovennieuws.nl -ekstrabladet.dk -el-hakaek.com -el-hourria.com -el-massa.com -el-youm.info -elahdath.net -elbilad.net -elciudadanoweb.com -elcolombiano.com -eldjoumhouria.dz -electronicintifada.net -elheddaf.com -elkhabar.com -elkhabarerriadhi.com -elliotlaketoday.com -elmakam.com -elmassar-ar.com -elmoudjahid.com -elperiodic.ad -elraaed.com -elsevier.nl -elwatan.com -elystandard.co.uk -emaratalyoum.com -emirates247.com -en.alalam.ir -en.aswatmasriya.com -en.farsnews.com -en.gigazine.net -en.hawarnews.com -en.novayagazeta.ru -en.radiovaticana.va -en.video.canoe.tv -engadget.com -english.ahram.org.eg -english.chosun.com -english.juventudrebelde.cu -english.kyodonews.jp -english.yonhapnews.co.kr -ennaharonline.com -ent.qq.com -entv.dz -environment.nationalgeographic.com -eo.radiovaticana.va -eqmweekly.com.af -es.hawarnews.com -es.radiovaticana.va -es.reuters.com -escambray.cu -ess.fi -etn.fi -eufin.nl -euronews.com -evatt.org.au -eveningnews24.co.uk -exame.co.ao -examiner.co.uk -exiledonline.com -exmouthherald.co.uk -exmouthjournal.co.uk -express.co.uk -expressandstar.com -expressen.se -fa.timesofisrael.com -fakenhamtimes.co.uk -farsnews.com -fashion.qq.com -fd.nl -feeds.24.com -feeds.arstechnica.com -feeds.bbci.co.uk -feeds.cnevids.com -feeds.feedburner.com -feeds.feedburner.jp -feeds.gawker.com -feeds.government.nl -feeds.huffingtonpost.com -feeds.ign.com -feeds.kauppalehti.fi -feeds.macrumors.com -feeds.mashable.com -feeds.news24.com -feeds.nytimes.com -feeds.sciencedaily.com -feeds.skynews.com -feeds.washingtonpost.com -feeds.webwereld.nl -feeds.wsjonline.com -feeds2.feedburner.com -feweek.co.uk -fi.radiovaticana.va -fightland.vice.com -finance.qq.com -#finance.yahoo.com -fiskeribladet.no -flip.channelnewsasia.com -forbes.com -fortmcmurraytoday.com -fortsaskatchewanrecord.com -forum.ad -foxnews.com -foxue.qq.com -fr.canoe.ca -fr.radiovaticana.va -fr.reuters.com -fr.timesofisrael.com -fr.video.canoe.tv -france24.com -frenchwam.com -friheten.no -frontpage.fok.nl -ft.com -ftp3.conae.gov.ar -fullfact.org -futbolete.com -games.qq.com -gamespy.dk -gazeta-pravda.ru -gazeta55.al -gazetanovgorod.ru -gazetayakutia.ru -gazettelive.co.uk -gazettetimes.com -getbucks.co.uk -gethampshire.co.uk -getreading.co.uk -getsurrey.co.uk -getwestlondon.co.uk -gfwadvertiser.ca -gigazine.net -gizmodo.com -globalnews.ca -godubai.com -gongyi.qq.com -googleblog.blogspot.com -googleblog.blogspot.nl -gov.uk -government.nl -gp.se -granma.cu -grattan.edu.au -gravesendreporter.co.uk -greatyarmouthmercury.co.uk -greenun24.co.uk -groningenjournaal.nl -guardian.ng -guccifer2.wordpress.com -guelphtoday.com -guerrillero.cu -gulfnews.com -gulftoday.ae -guruwatch.nl -gva.be -haaretz.co.il -haaretz.com -hackneygazette.co.uk -halifaxtoday.ca -hamhigh.co.uk -hamhighbroadway.co.uk -hannaherald.com -hardenbergnieuws.nl -hawarnews.com -hbl.fi -hd.stheadline.com -he.radiovaticana.va -health.qq.com -health.usnews.com -helsinkitimes.fi -heraldlive.co.za -hertsad.co.uk -heute.de -heyetnet.org -hi.radiovaticana.va -highrivertimes.com -hillaryclinton.com -hinckleytimes.net -hintonparklander.com -hk.on.cc -hln.be -horizons-dz.com -house.qq.com -hr.radiovaticana.va -hrnicholls.com.au -hs.fi -hu.radiovaticana.va -huffingtonpost.com -huntspost.co.uk -hy.radiovaticana.va -i-d.vice.com -iamexpat.nl -ib.edu.ar -ibinda.com -ibtimes.co.in -ibtimes.co.uk -ibtimes.com -ibtimes.com.au -icelandreview.com -idag.no -iex.nl -iexgeld.nl -iexprofs.nl -ilfordrecorder.co.uk -ilkka.fi -iltalehti.fi -iltasanomat.fi -in.reuters.com -independent.co.uk -indianexpress.com -infocanuelas.com -infosoir.com -infoworld.com -inta.gob.ar -intelligencer.ca -international.nytimes.com -internationalaffairs.org.au -inti.gob.ar -inti.gov.ar -invasor.cu -io-tech.fi -ipa.org.au -ipolitics.ca -ips.cap.anu.edu.au -ipswichstar.co.uk -iraq-amsi.net -irna.ir -islingtongazette.co.uk -it.ibtimes.com -it.reuters.com -itv.com -itviikko.fi -izvestia.ru -ja.radiovaticana.va -japantimes.co.jp -jeugdjournaal.nl -jeune-independant.net -jia360.com -johnkasich.com -joop.nl -jornaldeangola.sapo.ao -jornaldosdesportos.sapo.ao -jornalf8.net -journaldemontreal.com -jp.ibtimes.com -jp.reuters.com -jp.techcrunch.com -jp.vice.com -jp.wsj.com -juventudrebelde.cu -jyllands-posten.dk -kabayanweekly.com -kabulpress.org -kaleva.fi -kansalainen.fi -kansanuutiset.fi -karjalainen.fi -karjalansanomat.ru -kawalisse.com -kbctv.co.ke -kenoradailyminerandnews.com -kentnews.co.uk -kentonline.co.uk -khaama.com -khabarafghan.com -khaleejtimes.com -kid.qq.com -kids.nationalgeographic.com -kilburntimes.co.uk -kincardinenews.com -kingstonthisweek.com -kitchener.ctvnews.ca -klassekampen.no -kodima.rkperiodika.ru -kohajone.com -kommersant.ru -koreatimes.co.kr -kotaku.com -kp.ru -kr.nknews.org -kr.radiovaticana.va -kristeligt-dagblad.dk -ksml.fi -ktimes.com -ku.hawarnews.com -lacapital.com.ar -lactualite-dz.info -lakeshoreadvance.com -lanacion.com.ar -lanueva.com -lapinkansa.fi -laprensa.com.ar -lapresse.tn -larawbar.net -larazon.com.ar -lat.wsj.com -latimes.com -latribune-dz.com -lautomarche.com -lavoz.com.ar -lawandtax-news.com -leaderpost.com -lejourdalgerie.com -leloir.org.ar -lemaghrebdz.com -lematindz.net -lemauricien.com -lemidi-dz.com -lemonde.fr -leparisien.fr -lequotidien-oran.com -lesnouvellesnews.fr -lesoir.be -lesoirdalgerie.com -lestrepublicain.com -letempsdz.com -lexpressiondz.com -lfpress.com -lgz.ru -liberte-algerie.com -libyaherald.com -lifehacker.com -live.huffingtonpost.com -liveleak.com -liverpoolecho.co.uk -lnr-dz.com -london.ctvnews.ca -looopings.nl -losandes.com.ar -loughboroughecho.net -lowestoftjournal.co.uk -lowyinstitute.org -lrt.lt -lt.radiovaticana.va -lta.reuters.com -ltn.com.tw -lv.radiovaticana.va -maaseuduntulevaisuus.fi -macclesfield-express.co.uk -mackungfu.org -macleans.ca -macrumors.com -macrumors.ro -madagascar-tribune.com -madamasr.com -mailonsunday.co.uk -managementherald.com.ar -manchestereveningnews.co.uk -mandegardaily.com -mannkal.org -mannwest.com -marcorubio.com -marketwatch.com -marmai.fi -marsad.ly -mashable.com -mashable.pw -mayerthorpefreelancer.com -media.tpo.nl -menziesrc.org -meridianbooster.com -mes.ad -metro.co.uk -metro.fi -metro.se -metrohk.com.hk -metronews.ca -metronieuws.nl -mg.co.za -middleeasteye.net -midnorthmonitor.com -midweekherald.co.uk -mikrobitti.fi -mil.qq.com -mingpao.com -mirror.co.uk -mk.radiovaticana.va -mk.ru -mkset.ru -ml.radiovaticana.va -mn.ru -mobilefeeds.wsj.com -moheet.com -money.rbc.ru -money.usnews.com -monitor.co.ug -montreal.ctvnews.ca -montrealgazette.com -morgenbladet.no -morningstaronline.co.uk -mospravda.ru -motherboard.vice.com -motors-dz.com -mountain-news.com -msnbc.com -munchies.vice.com -mundod.lavoz.com.ar -mx.dk -mx.reuters.com -naenara.com.kp -nanaimodailynews.com -nantonnews.com -nasdaq.com -nation.co.ke -nationalobserver.com -nationalpost.com -nationen.no -navbharattimes.indiatimes.com -nbcnews.com -nd.nl -nederlandnieuws.nl -nerjanieuws.nl -newburytoday.co.uk -newhamrecorder.co.uk -newizv.ru -newlookmedia.ru -news.com.au -news.cubasi.cu -news.ltn.com.tw -news.mingpao.com -news.nationalgeographic.com -news.nationalpost.com -news.qq.com -news.sky.com -news.tbs.co.jp -news.vice.com -news.vip-urlaub.de -news.yahoo.com -news24.com -newscentralasia.net -newsletter.co.uk -newsmonkey.be -newsrss.bbc.co.uk -newtimes.co.rw -newvision.co.ug -ng.ru -niagarafallsreview.ca -nieuws.tpo.nl -nijmegennieuws.nl -nikkei.com -nisnews.nl -nknews.org -nltimes.nl -noisey.vice.com -north-africa.com -northdevongazette.co.uk -northernontario.ctvnews.ca -northnorfolknews.co.uk -northsomersettimes.co.uk -norwichadvertiser24.co.uk -norwichgazette.com -nos.nl -notinet.icrt.cu -novayagazeta.ru -novojornal.co.ao -novosti.acn.cu -npr.org -nrc.nl -nrk.no -nsl-basketball.sport360.com -nsl-football.sport360.com -nsl.sport360.com -nu.nl -nugget.ca -nunatsiaqonline.ca -nycity.today -nyheder.tv2.dk -nypost.com -nytid.no -nytimes.com -nzherald.co.nz -o.canada.com -og.ru -ohio.com -one.iex.nl -onionstudios.com -opais.co.ao -orientaldaily.on.cc -osservatoreromano.va -ossnieuws.nl -ottawa.ctvnews.ca -ottawacitizen.com -ottawasun.com -ouarsenis.com -ouest-france.fr -ouestribune-dz.com -ourworld.unu.edu -outlookafghanistan.net -owensoundsuntimes.com -oxfordtimes.co.uk -pagina12.com.ar -pajhwok.com -panorama-sport.com -panorama.com.al -parool.nl -participaties.nl -pdc.tv -percapita.org.au -periodico26.cu -photography.nationalgeographic.com -pinchercreekecho.com -pl.radiovaticana.va -pm.gc.ca -pnp.ru -politico.com -politico.eu -politiek.tpo.nl -politifact.com -polygon.com -portalangop.co.ao -portfolio.lesoir.be -postzambia.com -powned.tv -pqbnews.com -pressandjournal.co.uk -presstv.ir -prnewsonline.com -prosper.org.au -province.ru -prrecordgazette.com -pt.radiovaticana.va -qq.com -quote.rbc.ru -quotidien-oran.com -radio.nrk.no -radioalgerie.dz -radiolome.tg -randpaul.com -raqqa-sl.com -rawstory.com -rbc.ru -rbth.com -readwrite.com -recorder.ca -redstar.ru -refdag.nl -regina.ctvnews.ca -regio.tpo.nl -republicoftogo.com -reuters.com -rg.ru -ria.ru -rionegro.com.ar -ro.radiovaticana.va -rodong.rep.kp -romfordrecorder.co.uk -rossendalefreepress.co.uk -royston-crow.co.uk -rss.canada.com -rss.canoe.com -rss.cnn.com -rss.dw.com -rss.feedsportal.com -rss.nytimes.com -rss.upi.com -rt.com -rtl7darts.nl -rtlnieuws.nl -ru.hawarnews.com -ru.radiovaticana.va -ru.reuters.com -rumbosdigital.com -ruokala.net -ruscur.ru -sabawoon.com -sackvilletribunepost.com -saffronwaldenreporter.co.uk -sam.az -sammobile.com -sargasso.nl -saskatoon.ctvnews.ca -satakunnankansa.fi -saultstar.com -savonsanomat.fi -sawt-alahrar.net -sci-news.com -sciencedaily.com -sciencenews.org -scotlandnow.dailyrecord.co.uk -semanarioeconomico.co.ao -sfgate.com -sidmouthherald.co.uk -siliconprairienews.com -simcoereformer.ca -sk.radiovaticana.va -sl.radiovaticana.va -sobesednik.ru -sootoday.com -sot.com.al -southportvisiter.co.uk -sovsakh.ru -sovsport.ru -spbvedomosti.ru -spiegel.de -sport-express.ru -sport.rbc.ru -sport360.com -sports.qq.com -sports.vice.com -#sports.yahoo.com -sq.radiovaticana.va -standaard.be -standard-freeholder.com -standard.co.uk -static.feed.rbc.ru -stcatharinesstandard.ca -std.stheadline.com -stheadline.com -stock.qq.com -stowmarketmercury.co.uk -stratfordbeaconherald.com -strathmorestandard.com -stthomastimesjournal.com -student.societyforscience.org -sudburymercury.co.uk -sunnewsonline.com -suomenmaa.fi -suomenuutiset.fi -super.ae -sustg.com -sv.radiovaticana.va -svd.se -svenska.yle.fi -svt.se -sw.radiovaticana.va -ta.radiovaticana.va -taand.com -tagesschau.de -tai.org.au -taipeitimes.com -talk.tpo.nl -taloussanomat.fi -tchina.kyodonews.jp -tech.qq.com -techcrunch.asia -techcrunch.cn -techcrunch.com -techradar.me -tedcruz.org -tehrantimes.com -tekniikanmaailma.fi -telegraaf.nl -telegraph.co.uk -thanhnien.vn -the-japan-news.com -theantiguan.com -thearabianpost.com -theatlantic.com -theautonet.com -thebeaverton.com -thechronicleherald.ca -thecomet.net -thecragandcanyon.ca -thecreatorsproject.vice.com -thedailyobserver.ca -thedailystar.net -theglobeandmail.com -theguardian.com -thehindu.com -theindependent.co.zw -theintercept.com -thelocal.fr -themoscowtimes.com -thenational.ae -thenationalstudent.com -thenextweb.com -theonion.com -thepeterboroughexaminer.com -theprovince.com -theregister.co.uk -therwandan.com -thestage.co.uk -thestandard.com.hk -thestar.com -thestarphoenix.com -thesudburystar.com -thesun.co.uk -thesydneyinstitute.com.au -thetfordandbrandontimes.co.uk -thetimes.co.uk -theverge.com -theweathernetwork.com -thewestonmercury.co.uk -thewhig.com -thisdaylive.com -thump.vice.com -ti.radiovaticana.va -tielnieuws.nl -tilburgnieuws.nl -time.com -times.co.zm -timescolonist.com -timesofindia.indiatimes.com -timesofisrael.com -timminspress.com -timminstoday.com -tivi.fi -tmz.com -today.ng -todayszaman.com -togozine.com -tolafghan.com -tomshardware.com -toronto.ctvnews.ca -torontosun.com -torrentfreak.com -tpo.nl -tr.farsnews.com -tr.hawarnews.com -trabajadores.cu -transactiondalgerie.com -travel.nationalgeographic.com -travel.usnews.com -tribune.com.pk -trouw.nl -trud.ru -ts.fi -tumentoday.ru -tuoitrenews.vn -tv.echoroukonline.com -tv.rbc.ru -tverlife.ru -tvt.tg -tweakers.net -twenterandnieuws.nl -uaeinteract.com -udennieuws.nl -udn.com -uk.radiovaticana.va -uk.reuters.com -unu.edu -upi.com -uriminzokkiri.com -usatoday.com -usnews.com -ussc.edu.au -utrechtjournaal.nl -uusisuomi.fi -v.qq.com -vancouverisland.ctvnews.ca -vancouverobserver.com -vanguardia.cu -vanguardngr.com -vaterland.li -vechorka.ru -vedomosti.ru -veghelnieuws.nl -veintitres.com.ar -vendingtimes.com -verkkouutiset.fi -vi.radiovaticana.va -vice.cn -vice.com -viceland.com -video.asia.nikkei.com -video.cnbc.com -video.nationalgeographic.com -video.usnews.com -video.vice.com -video.wired.com -videos.leparisien.fr -vihrealanka.fi -vl.no -vmnews.ru -vn.ru -volkskrant.nl -vos.lavoz.com.ar -vremya.ru -vulcanadvocate.com -wadsam.com -wakteldjazair.com -walesonline.co.uk -wam.ae -washingtonpost.com -wattonandswaffhamtimes.co.uk -waveneyadvertiser24.co.uk -web.kbcalgerie.tv -webwereld.nl -wharf.co.uk -whitecourtstar.com -whtimes.co.uk -wijchennieuws.nl -wikileaks.org -windsor.ctvnews.ca -windsorstar.com -winnipeg.ctvnews.ca -winnipegsun.com -wired.com -wisbechstandard.co.uk -woodstocksentinelreview.com -wsj.com -www3.nhk.or.jp -wymondhamandattleboroughmercury.co.uk -xinhuanet.com -yarmouthadvertiser24.co.uk -yemen-nn.com -yenisafak.com -yle.fi -ynet.co.il -ynetnews.com -yomiuri.co.jp -yonhapnews.co.kr -yonhapnews.feedsportal.com -ypgrojava.com -zaman.com.tr -zamanarabic.com -zamanfrance.fr -zamankurdi.com -zh.radiovaticana.va -zwollenieuws.nl - diff --git a/gmd.lua b/gmd.lua new file mode 100644 index 0000000..87641a5 --- /dev/null +++ b/gmd.lua @@ -0,0 +1,122 @@ +strin = "2~NzUwMCBzdGFycyBjOg==~4~3~9~1 month~6~1803945|2~SSBiZWF0IDYgaW5zYW5lIGRlbW9ucyBpbiAyNCBob3VycyBsbWFvOiBOZWNyb3BvbGlzLCBUaGUgQ2F2ZXJucyBJSSwgRWxlbWVudHMgWCwgWCBBZHZlbnR1cmUsIFNhZGlzbSwgYW5kIEJsYXN0ZXIgYzo=~4~21~9~8 months~6~1793260|2~L1wvXC9cIDwz~4~6~9~1 year~6~1785414|2~U2VudCBmcm9tIGlPUyBTaG9ydGN1dHMh~4~8~9~1 year~6~1776426|2~VGhpcyBjb21tZW50IHdhcyB1cGxvYWRlZCBmb3IgdGhlIEdEIERvY3Mh~4~5~9~1 year~6~1772719|2~VGhlIHRyaWxvZ3kgaGFzIGJlZW4gY29tcGxldGVkLi4uR0cgQWZ0ZXJtYXRoIQ==~4~8~9~1 year~6~1766450|2~Im93byIgLSBGb3VuZG15YmFsbA==~4~4~9~1 year~6~1766338|2~NTAwMCBzdGFycyE=~4~12~9~2 years~6~1756926|2~Qmxvb2RiYXRoIEdHISEh~4~24~9~2 years~6~1745624|2~QWxsZWdpYW5jZSAxMDAl~4~3~9~2 years~6~1744292#73:0:10" + +-- https://stackoverflow.com/questions/40149617/split-string-with-specified-delimiter-in-lua +function split(s, sep) + local fields = {} + + local sep = sep or " " + local pattern = string.format("([^%s]+)", sep) + string.gsub(s, pattern, function(c) fields[#fields + 1] = c end) + + return fields +end +-- https://stackoverflow.com/questions/40149617/split-string-with-specified-delimiter-in-lua +-- +GMD = {} +GMD["comments"] = {} + +function table.show(t, name, indent) + local cart -- a container + local autoref -- for self references + + --[[ counts the number of elements in a table + local function tablecount(t) + local n = 0 + for _, _ in pairs(t) do n = n+1 end + return n + end + ]] + -- (RiciLake) returns true if the table is empty + local function isemptytable(t) return next(t) == nil end + + local function basicSerialize (o) + local so = tostring(o) + if type(o) == "function" then + local info = debug.getinfo(o, "S") + -- info.name is nil because o is not a calling level + if info.what == "C" then + return string.format("%q", so .. ", C function") + else + -- the information is defined through lines + return string.format("%q", so .. ", defined in (" .. + info.linedefined .. "-" .. info.lastlinedefined .. + ")" .. info.source) + end + elseif type(o) == "number" or type(o) == "boolean" then + return so + else + return string.format("%q", so) + end + end + + local function addtocart (value, name, indent, saved, field) + indent = indent or "" + saved = saved or {} + field = field or name + + cart = cart .. indent .. field + + if type(value) ~= "table" then + cart = cart .. " = " .. basicSerialize(value) .. ";\n" + else + if saved[value] then + cart = cart .. " = {}; -- " .. saved[value] + .. " (self reference)\n" + autoref = autoref .. name .. " = " .. saved[value] .. ";\n" + else + saved[value] = name + --if tablecount(value) == 0 then + if isemptytable(value) then + cart = cart .. " = {};\n" + else + cart = cart .. " = {\n" + for k, v in pairs(value) do + k = basicSerialize(k) + local fname = string.format("%s[%s]", name, k) + field = string.format("[%s]", k) + -- three spaces between levels + addtocart(v, fname, indent .. " ", saved, field) + end + cart = cart .. indent .. "};\n" + end + end + end + end + + name = name or "__unnamed__" + if type(t) ~= "table" then + return name .. " = " .. basicSerialize(t) + end + cart, autoref = "", "" + addtocart(t, name, indent) + return cart .. autoref +end + +GMD["comments"]["parse"] = function(comment) + local splitted = split(comment, ":") + local retern = {} + retern.comment = splitted[1] + retern.account = splitted[2] + + retern.parsed = {} + retern.parsed.comment = {} + local data = split(retern.comment, "|") + for i=1, #data do + retern.parsed.comment[i] = {} + -- comment parser + local ndata = split(data[i], "~") + for j=1, #ndata do + if not (j % 2 == 0) then -- key + key = ndata[j] + else -- value + local value = ndata[j] + retern.parsed.comment[i][key] = value + end + end + end + -- print("DONE") + -- print(table.show(retern.parsed.comment)) + + return retern +end +print(GMD["comments"]["parse"](strin)) diff --git a/grab.lua b/grab.lua new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/grab.lua @@ -0,0 +1 @@ + diff --git a/ignore-patterns.txt b/ignore-patterns.txt deleted file mode 100644 index 12b7039..0000000 --- a/ignore-patterns.txt +++ /dev/null @@ -1,21 +0,0 @@ -[%?&]ver=[0-9a-zA-Z%.]*%.16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9] -[%?&]ver=16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9] -[%?&]t=16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ -[%?&]t=16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]%.[0-9]+$ -[%?&]hash=16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ -%?16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ -%?16[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ -%?6[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ -%?v=[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$ -;extid=[0-9a-f]+$ -[%?&;]_flowexecutionkey= -[%?&;]sid= -[%?&;]cid= -[%?&;]jsessionid= -[%?&;]script_case_session= -[%?&;]Dilid= -[%?&;][pP][hH][pP][sS][eE][sS][sS][iI][dD]= -[%?&;]wtd= -[%?&;]nonce= -[%?&;]rnd= -^https?://[^/]+/index%.php%?s= diff --git a/page-requisite-patterns.txt b/page-requisite-patterns.txt deleted file mode 100644 index f519342..0000000 --- a/page-requisite-patterns.txt +++ /dev/null @@ -1,17 +0,0 @@ -%.apng -%.avif -%.gif -%.jpe?g -%.jfif -%.pjpeg -%.pjp -%.png -%.svg -%.webp -%.bmp -%.ico -%.cur -%.tif -%.tiff -%.js -%.css diff --git a/pipeline.py b/pipeline.py index d05b1bb..8dd0208 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,50 +1,41 @@ -# encoding=utf8 -import datetime -from distutils.version import StrictVersion -import hashlib -import json -import os -import random -import shutil -import socket -import subprocess -import sys -import threading -import time -import string -import sys +################### +###GEOMETRY DASH### +###GRAB SCRIPTS#### +################### -if sys.version_info[0] < 3: - from urllib import unquote - from urlparser import parse_qs -else: - from urllib.parse import unquote, parse_qs +# Based heavily off of ArchiveTeam/urls-grab -import requests import seesaw -from seesaw.config import realize, NumberConfigValue +from seesaw.project import * +from seesaw.tracker import * +from seesaw.util import * +from seesaw.pipeline import Pipeline from seesaw.externalprocess import WgetDownload from seesaw.item import ItemInterpolation, ItemValue -from seesaw.pipeline import Pipeline -from seesaw.project import Project from seesaw.task import SimpleTask, LimitConcurrent -from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \ - UploadWithTracker, SendDoneToTracker -from seesaw.util import find_executable -import zstandard -if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'): - raise Exception('This pipeline needs seesaw version 0.8.5 or higher.') - -LOCK = threading.Lock() +import hashlib +import shutil +import socket +import sys +project = Project( + title = "Geometry Dash", + project_html = """ +
Time to archive Geometry Dash?
+ """, +) ########################################################################### -# Find a useful Wget+Lua executable. +# The version number of this pipeline definition. # -# WGET_AT will be set to the first path that -# 1. does not crash with --version, and -# 2. prints the required version string +# Update this each time you make a non-cosmetic change. +# It will be added to the WARC files and reported to the tracker. +VERSION = '20220428.01' +#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36' +TRACKER_ID = 'geometrytrash' +TRACKER_HOST = '172.17.0.1:8501' WGET_AT = find_executable( 'Wget+AT', @@ -60,25 +51,6 @@ WGET_AT = find_executable( if not WGET_AT: raise Exception('No usable Wget+At found.') - -########################################################################### -# The version number of this pipeline definition. -# -# Update this each time you make a non-cosmetic change. -# It will be added to the WARC files and reported to the tracker. -VERSION = '20220423.01' -#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36' -TRACKER_ID = 'urls' -TRACKER_HOST = 'legacy-api.arpa.li' -MULTI_ITEM_SIZE = 40 -MAX_DUPES_LIST_SIZE = 10000 - -########################################################################### -# This section defines project-specific tasks. -# -# Simple tasks (tasks that do not need any concurrency) are based on the -# SimpleTask class and have a process(item) method that is called for -# each item. class CheckIP(SimpleTask): def __init__(self): SimpleTask.__init__(self, 'CheckIP') @@ -112,16 +84,6 @@ class CheckIP(SimpleTask): self._counter -= 1 -class CheckRequirements(SimpleTask): - def __init__(self): - SimpleTask.__init__(self, 'CheckRequirements') - self._checked = False - - def process(self, item): - if not self._checked: - assert shutil.which('pdftohtml') is not None - self._checked = True - class PrepareDirectories(SimpleTask): def __init__(self, warc_prefix): @@ -146,77 +108,8 @@ class PrepareDirectories(SimpleTask): time.strftime('%Y%m%d-%H%M%S') ]) - if not os.path.isfile('duplicate-urls.txt'): - open('duplicate-urls.txt', 'w').close() - - open('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, 'w').close() - open('%(item_dir)s/%(warc_file_base)s_bad-urls.txt' % item, 'w').close() - open('%(item_dir)s/%(warc_file_base)s_duplicate-urls.txt' % item, 'w').close() - - -class MoveFiles(SimpleTask): - def __init__(self): - SimpleTask.__init__(self, 'MoveFiles') - - def process(self, item): - os.rename('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, - '%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst' % item) - - shutil.rmtree('%(item_dir)s' % item) - - -class SetBadUrls(SimpleTask): - def __init__(self): - SimpleTask.__init__(self, 'SetBadUrls') - - def unquote_url(self, url): - temp = unquote(url) - while url != temp: - url = temp - temp = unquote(url) - return url - - def process(self, item): - item['item_name_original'] = item['item_name'] - items = item['item_name'].split('\0') - items_lower = [self.unquote_url(url).strip().lower() for url in item['item_urls']] - with open('%(item_dir)s/%(warc_file_base)s_bad-urls.txt' % item, 'r') as f: - for url in { - self.unquote_url(url).strip().lower() for url in f - }: - index = items_lower.index(url) - items.pop(index) - items_lower.pop(index) - item['item_name'] = '\0'.join(items) - - -class SetDuplicateUrls(SimpleTask): - def __init__(self): - SimpleTask.__init__(self, 'SetNewDuplicates') - - def process(self, item): - with LOCK: - self._process(item) - - def _process(self, item): - with open('duplicate-urls.txt', 'r') as f: - duplicates = {s.strip() for s in f} - with open('%(item_dir)s/%(warc_file_base)s_duplicate-urls.txt' % item, 'r') as f: - for url in f: - duplicates.add(url.strip()) - with open('duplicate-urls.txt', 'w') as f: - # choose randomly, to cycle periodically popular URLs - duplicates = list(duplicates) - random.shuffle(duplicates) - f.write('\n'.join(duplicates[:MAX_DUPES_LIST_SIZE])) - - -class MaybeSendDoneToTracker(SendDoneToTracker): - def enqueue(self, item): - if len(item['item_name']) == 0: - return self.complete_item(item) - return super(MaybeSendDoneToTracker, self).enqueue(item) - + open('%(item_dir)s/%(warc_file_base)s.warc.gz' % item, 'w').close() + open('%(item_dir)s/%(warc_file_base)s_retry-urls.txt' % item, 'w').close() def get_hash(filename): with open(filename, 'rb') as in_file: @@ -224,104 +117,65 @@ def get_hash(filename): CWD = os.getcwd() PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py')) -LUA_SHA1 = get_hash(os.path.join(CWD, 'urls.lua')) +LUA_SHA1 = get_hash(os.path.join(CWD, 'grab.lua')) +GMD_LUA_SHA1 = get_hash(os.path.join(CWD, 'gmd.lua')) def stats_id_function(item): d = { 'pipeline_hash': PIPELINE_SHA1, 'lua_hash': LUA_SHA1, + 'gmd_lua_hash': GMD_LUA_SHA1, 'python_version': sys.version, } return d +class MoveFiles(SimpleTask): + def __init__(self): + SimpleTask.__init__(self, 'MoveFiles') -class ZstdDict(object): - created = 0 - data = None + def process(self, item): + os.rename('%(item_dir)s/%(warc_file_base)s.warc.gz' % item, + '%(data_dir)s/%(warc_file_base)s.warc.gz' % item) - @classmethod - def get_dict(cls): - if cls.data is not None and time.time() - cls.created < 1800: - return cls.data - response = requests.get( - 'https://legacy-api.arpa.li/dictionary', - params={ - 'project': TRACKER_ID - } - ) - response.raise_for_status() - response = response.json() - if cls.data is not None and response['id'] == cls.data['id']: - cls.created = time.time() - return cls.data - print('Downloading latest dictionary.') - response_dict = requests.get(response['url']) - response_dict.raise_for_status() - raw_data = response_dict.content - if hashlib.sha256(raw_data).hexdigest() != response['sha256']: - raise ValueError('Hash of downloaded dictionary does not match.') - if raw_data[:4] == b'\x28\xB5\x2F\xFD': - raw_data = zstandard.ZstdDecompressor().decompress(raw_data) - cls.data = { - 'id': response['id'], - 'dict': raw_data - } - cls.created = time.time() - return cls.data + shutil.rmtree('%(item_dir)s' % item) +class AwfulBackfeed(SimpleTask): + def __init__(self): + SimpleTask.__init__(self, 'AwfulBackfeed') + + def process(self, item): + with open('%(item_dir)s/new_items' % item) as file: + new_items = file.read() class WgetArgs(object): def realize(self, item): - with open('user-agents.txt', 'r') as f: - USER_AGENT = random.choice(list(f)).strip() wget_args = [ 'timeout', '1000', WGET_AT, - '-U', USER_AGENT, '-v', '--content-on-error', - '--lua-script', 'urls.lua', + '--lua-script', 'grab.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), #'--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', - '--recursive', '--level=inf', - '--no-parent', '--timeout', '10', - '--tries', '2', + '--tries', '10', '--span-hosts', - '--page-requisites', - '--waitretry', '0', + '--waitretry', '5000', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), - '--warc-header', 'operator: Archive Team', + '--warc-header', 'operator: TheTechRobo