All done.
Seems to work. Discovery for TharpsonTheimer hasn't been done yet.
This commit is contained in:
parent
4879afb63d
commit
8df93e1d56
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
|||
run:
|
||||
make clean
|
||||
docker build -t img .
|
||||
docker run --rm img test
|
||||
docker run -v "/media/thetechrobo/2tb/obitdata:/finished" --rm img TheTechRobo --concurrent 1
|
||||
|
||||
clean:
|
||||
rm -rf img
|
||||
|
|
139
gmd.lua
139
gmd.lua
|
@ -1,139 +0,0 @@
|
|||
-- https://stackoverflow.com/questions/40149617/split-string-with-specified-delimiter-in-lua
|
||||
--- Split a string on any of the characters in `sep`.
-- Every character of `sep` is an individual delimiter. Runs of delimiters
-- are treated as one, so empty fields are never produced.
-- @param s   string to split
-- @param sep string of delimiter characters (defaults to a single space)
-- @return array table holding the non-empty fields, in order
function split(s, sep)
    local fields = {}

    -- Escape every non-alphanumeric delimiter so characters such as "%", "^",
    -- "]" or "-" are taken literally inside the character class instead of
    -- corrupting the pattern or forming a range.
    local class = (sep or " "):gsub("%W", "%%%0")

    for field in s:gmatch("([^" .. class .. "]+)") do
        fields[#fields + 1] = field
    end

    return fields
end
|
||||
-- https://stackoverflow.com/questions/40149617/split-string-with-specified-delimiter-in-lua
|
||||
--
|
||||
GMD = {}
|
||||
GMD["comments"] = {}
|
||||
|
||||
function table.show(t, name, indent)
|
||||
local cart -- a container
|
||||
local autoref -- for self references
|
||||
|
||||
--[[ counts the number of elements in a table
|
||||
local function tablecount(t)
|
||||
local n = 0
|
||||
for _, _ in pairs(t) do n = n+1 end
|
||||
return n
|
||||
end
|
||||
]]
|
||||
-- (RiciLake) returns true if the table is empty
|
||||
local function isemptytable(t) return next(t) == nil end
|
||||
|
||||
local function basicSerialize (o)
|
||||
local so = tostring(o)
|
||||
if type(o) == "function" then
|
||||
local info = debug.getinfo(o, "S")
|
||||
-- info.name is nil because o is not a calling level
|
||||
if info.what == "C" then
|
||||
return string.format("%q", so .. ", C function")
|
||||
else
|
||||
-- the information is defined through lines
|
||||
return string.format("%q", so .. ", defined in (" ..
|
||||
info.linedefined .. "-" .. info.lastlinedefined ..
|
||||
")" .. info.source)
|
||||
end
|
||||
elseif type(o) == "number" or type(o) == "boolean" then
|
||||
return so
|
||||
else
|
||||
return string.format("%q", so)
|
||||
end
|
||||
end
|
||||
|
||||
local function addtocart (value, name, indent, saved, field)
|
||||
indent = indent or ""
|
||||
saved = saved or {}
|
||||
field = field or name
|
||||
|
||||
cart = cart .. indent .. field
|
||||
|
||||
if type(value) ~= "table" then
|
||||
cart = cart .. " = " .. basicSerialize(value) .. ";\n"
|
||||
else
|
||||
if saved[value] then
|
||||
cart = cart .. " = {}; -- " .. saved[value]
|
||||
.. " (self reference)\n"
|
||||
autoref = autoref .. name .. " = " .. saved[value] .. ";\n"
|
||||
else
|
||||
saved[value] = name
|
||||
--if tablecount(value) == 0 then
|
||||
if isemptytable(value) then
|
||||
cart = cart .. " = {};\n"
|
||||
else
|
||||
cart = cart .. " = {\n"
|
||||
for k, v in pairs(value) do
|
||||
k = basicSerialize(k)
|
||||
local fname = string.format("%s[%s]", name, k)
|
||||
field = string.format("[%s]", k)
|
||||
-- three spaces between levels
|
||||
addtocart(v, fname, indent .. " ", saved, field)
|
||||
end
|
||||
cart = cart .. indent .. "};\n"
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
name = name or "__unnamed__"
|
||||
if type(t) ~= "table" then
|
||||
return name .. " = " .. basicSerialize(t)
|
||||
end
|
||||
cart, autoref = "", ""
|
||||
addtocart(t, name, indent)
|
||||
return cart .. autoref
|
||||
end
|
||||
|
||||
GMD.comments.mapping = {"levelID","comment","authorPlayerID","likes","dislikes","messageID","spam","authorAccountID","age","percent","modBadge","moderatorChatColor"} -- https://docs.gdprogra.me/#/resources/server/comment
|
||||
|
||||
--- Parse a raw comments response string.
-- The text before the first ":" is taken as the comment list and the next
-- ":"-separated piece as the account section. The comment list is split on
-- "|" into records, and each record on "~" into alternating key/value
-- tokens. Numeric keys are translated through `GMD.comments.mapping`; a key
-- with no mapping entry is kept under its raw text instead of raising
-- "table index is nil". A trailing key without a value is ignored.
-- @param comments raw response body
-- @return false when the input has fewer than two ":"-separated pieces,
--         otherwise a table `{ comment, account, parsed = { comment = {...} } }`
GMD["comments"]["parse"] = function(comments)
    local sections = split(comments, ":")
    if not sections[2] then
        return false
    end

    local retern = {
        comment = sections[1],
        account = sections[2],
        parsed = { comment = {} },
    }

    for i, record in ipairs(split(retern.comment, "|")) do
        local fields = {}
        local tokens = split(record, "~")
        -- Tokens come in (key, value) pairs: odd positions are keys.
        for j = 1, #tokens - 1, 2 do
            local raw_key = tokens[j]
            -- `key` is local so parsing no longer leaks a global.
            local key = GMD.comments.mapping[tonumber(raw_key)] or raw_key
            fields[key] = tokens[j + 1]
        end
        retern.parsed.comment[i] = fields
    end

    return retern
end
|
||||
--- Parse a response and return a single comment record from it.
-- @param self     the `GMD.comments` table (call with `:`)
-- @param comments raw response body handed to `self.parse`
-- @param pos      1-based index of the wanted comment (defaults to 1)
-- @return the parsed comment table at `pos`, or false when parsing fails
GMD["comments"]["getOneComment"] = function (self, comments, pos)
    local result = self.parse(comments)
    if result then
        return result.parsed.comment[pos or 1]
    end
    return false
end
|
||||
|
||||
|
||||
function GMDtest()
|
||||
local strin = "2~NzUwMCBzdGFycyBjOg==~4~3~9~1 month~6~1803945|2~SSBiZWF0IDYgaW5zYW5lIGRlbW9ucyBpbiAyNCBob3VycyBsbWFvOiBOZWNyb3BvbGlzLCBUaGUgQ2F2ZXJucyBJSSwgRWxlbWVudHMgWCwgWCBBZHZlbnR1cmUsIFNhZGlzbSwgYW5kIEJsYXN0ZXIgYzo=~4~21~9~8 months~6~1793260|2~L1wvXC9cIDwz~4~6~9~1 year~6~1785414|2~U2VudCBmcm9tIGlPUyBTaG9ydGN1dHMh~4~8~9~1 year~6~1776426|2~VGhpcyBjb21tZW50IHdhcyB1cGxvYWRlZCBmb3IgdGhlIEdEIERvY3Mh~4~5~9~1 year~6~1772719|2~VGhlIHRyaWxvZ3kgaGFzIGJlZW4gY29tcGxldGVkLi4uR0cgQWZ0ZXJtYXRoIQ==~4~8~9~1 year~6~1766450|2~Im93byIgLSBGb3VuZG15YmFsbA==~4~4~9~1 year~6~1766338|2~NTAwMCBzdGFycyE=~4~12~9~2 years~6~1756926|2~Qmxvb2RiYXRoIEdHISEh~4~24~9~2 years~6~1745624|2~QWxsZWdpYW5jZSAxMDAl~4~3~9~2 years~6~1744292#73:0:10"
|
||||
assert(GMD["comments"]:getOneComment(strin)["comment"] == "NzUwMCBzdGFycyBjOg==")
|
||||
assert(not GMD["comments"]:getOneComment("-1"))
|
||||
end
|
||||
GMDtest()
|
68
grab.lua
68
grab.lua
|
@ -1,6 +1,5 @@
|
|||
require "gmd"
|
||||
|
||||
NEW_ITEMS = {}
|
||||
local htmlparser = require "htmlparser"
|
||||
require "table_show"
|
||||
|
||||
function readAll(file)
|
||||
local f = assert(io.open(file, "rb"))
|
||||
|
@ -9,19 +8,54 @@ function readAll(file)
|
|||
return content
|
||||
end
|
||||
|
||||
-- print(table.show(false))
|
||||
wget.callbacks.httploop_result = function(url, err, http_stat)
|
||||
local data = readAll(http_stat.local_file)
|
||||
-- Time to make sure that it's a valid response.
|
||||
local result = GMD.comments.parse(data)
|
||||
if result then
|
||||
return wget.actions.NOTHING
|
||||
else
|
||||
io.stderr:write("\aYou've been IP-banned from Geometry Dash's servers. Sorry about that.\n")
|
||||
io.stderr:write("Please let us know in #geometrytrash on hackint!\n")
|
||||
io.stderr:write("Sleeping 69 seconds. (nice)\n")
|
||||
os.execute("sleep 69")
|
||||
return wget.actions.ABORT -- We've been banned
|
||||
end
|
||||
QUEUED_URLS = false
|
||||
|
||||
--- Report whether `text` begins with `prefix` (plain comparison, no patterns).
-- Only the first `#prefix` bytes are compared, so a non-matching call no
-- longer scans the whole of `text` looking for the prefix further in.
-- @param text   string to inspect
-- @param prefix string expected at the start of `text`
-- @return true when `text` starts with `prefix` (always true for "")
function startswith(text, prefix)
    return text:sub(1, #prefix) == prefix
end
|
||||
|
||||
--- wget-lua hook called with the result of an HTTP request.
-- Logs the status code and the URL to stderr, separated by a space so the
-- two values stay readable, then lets wget carry on.
-- @param url       table describing the request; `url["url"]` is logged
-- @param err       unused
-- @param http_stat table describing the response; `http_stat["statcode"]` is logged
-- @return wget.actions.NOTHING
wget.callbacks.httploop_result = function(url, err, http_stat)
    io.stderr:write(http_stat["statcode"] .. " " .. url["url"] .. "\n")
    -- NOTE(review): returned explicitly rather than relying on how the host
    -- treats a missing return value — confirm against the wget-lua build in use.
    return wget.actions.NOTHING
end
|
||||
|
||||
--- wget-lua hook: collect additional requests for a page that was just saved.
-- Reads the saved body from `file`, parses it with `htmlparser` when `url`
-- belongs to one of the two handled sites, and returns the extra requests.
-- Unexpected page layouts still abort with an error, but the error now names
-- the selector or value that was missing.
-- @param file   path of the downloaded response body
-- @param url    URL that was just fetched
-- @param is_css unused
-- @param iri    unused
-- @return array of `{ url = ... }` / `{ url = ..., post_data = ... }` tables
wget.callbacks.get_urls = function(file, url, is_css, iri)
    local addedUrls = {}
    local data = readAll(file)
    io.stderr:write("Read data\n")

    if url:match("https://downsandsonfuneralhome%.com/tribute/details/[^/]+/Dr%-Alex%-Klym/obituary%.html") then
        local root = htmlparser.parse(data)
        io.stderr:write("Read root\n")

        -- Return the only element matching `selector` (nil when there is none).
        -- More than one match means the page is not laid out as expected.
        local function single(selector)
            local found = root(selector)
            assert(not found[2], "more than one element matches " .. selector)
            return found[1]
        end

        -- Queue the href of `link`; a missing link is reported by name.
        local function queue(label, link)
            assert(link, "no " .. label .. " link found on " .. url)
            table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. link.attributes.href})
        end

        local obituary = single("#obituary-link-list-item a")
        local condolences = single("#condolences-link-list-item a")
        local service = single("#service-link-list-item a")
        -- NOTE(review): the memories link is only checked for uniqueness and
        -- never queued, exactly as before — confirm this is intentional.
        single("#memories-link-list-item a")
        local charities = single("#charities-link-list-item a")
        local print_link = single(".print-obit-btn a")

        queue("obituary", obituary)
        queue("condolences", condolences)
        queue("service", service)
        queue("charities", charities)
        -- The print button is the only link allowed to be absent.
        if print_link then
            queue("print", print_link)
        end
        QUEUED_URLS = true
    end

    if startswith(url, "https://www.tharpsontheimerfh.com/tributes/") then
        local ok = assert(os.getenv("item_name"), "item_name environment variable is not set")
        local root = htmlparser.parse(data)
        local div = root("#obitsbarV31")
        assert(not div[2], "more than one element matches #obitsbarV31")
        assert(div[1], "no #obitsbarV31 element found on " .. url)
        local oid = assert(div[1].attributes["data-oid"], "#obitsbarV31 has no data-oid attribute")
        table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/obpgsnvn", post_data="sn=tributewall&oid=" .. oid})
        table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/prnobit", post_data="ok=" .. ok .. "&fcf=0&bg=1"})
    end

    io.stderr:write(table.show(addedUrls, "Added URLs "))
    return addedUrls
end
|
||||
|
|
|
@ -0,0 +1,256 @@
|
|||
-- vim: ft=lua ts=2 sw=2
|
||||
|
||||
-- Syntactic Sugar {{{
|
||||
--- Return `val` If it's Not Empty, otherwise a falsy value.
-- Strings are empty when zero-length. Tables are empty when they hold no
-- keys at all: `#` only measures the sequence part, so an options table made
-- solely of named keys (e.g. `{ silent = true }`) used to be reported as empty.
-- @param val string, table, nil or false
-- @return `val` when it has content; false/nil otherwise
local function rine(val)
  if type(val) == "table" then
    return next(val) ~= nil and val
  end
  return (val and #val>0) and val
end
|
||||
--- Return `a` If it's a Table, otherwise false.
local function rit(a)
  return (type(a) == "table") and a
end

-- Function that does nothing; stands in for disabled loggers.
local noop = function() end

-- Escape Lua-pattern magic characters in `s`. The outer parentheses drop
-- gsub's substitution count so callers always receive exactly one value
-- (a stray second value would be taken as `init` by string.find/match).
local esc = function(s) return (string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1")) end
local str = tostring
local char = string.char
local opts = rit(htmlparser_opts) or {} -- needed for silent/noerr/noout/nonl directives, also needed to be defined before `require` in such case

--- Write one formatted log line.
-- @param l   level letter; "i" goes to stdout, anything else to stderr
-- @param f   string.format template
-- @param ... values for the template
local prn = opts.silent and noop or function(l,f,...)
  local fd = (l=="i") and "stdout" or "stderr"
  local t = (" [%s] "):format(l:upper())
  -- `nonl = true` means "no newline"; concatenating the boolean itself would
  -- raise an error. Any other truthy value is still appended as given.
  local eol = opts.nonl
  if eol == true then
    eol = ""
  elseif not eol then
    eol = "\n"
  end
  io[fd]:write('[HTMLParser]'..t..f:format(...)..eol)
end

-- Level-specific loggers; each can be silenced through its own option.
local err = opts.noerr and noop or function(f,...) prn("e",f,...) end
local out = opts.noout and noop or function(f,...) prn("i",f,...) end
-- Current line number of the caller at stack level `lvl` (needs the debug library).
local line = debug and function(lvl) return debug.getinfo(lvl or 2).currentline end or noop
-- Debug logger: replaces "#LINE#" in the template with the calling line number.
local dbg = opts.debug and function(f,...) prn("d",f:gsub("#LINE#",str(line(3))),...) end or noop
|
||||
-- }}}
|
||||
-- Requires {{{
|
||||
local ElementNode = require"htmlparser.ElementNode"
|
||||
local voidelements = require"htmlparser.voidelements"
|
||||
--}}}
|
||||
local HtmlParser = {}
|
||||
local function parse(text,limit) -- {{{
|
||||
local opts = rine(opts) -- use top-level opts-table (the one, defined before requiring the module), if exists
|
||||
or rit(htmlparser_opts) -- or defined after requiring (but before calling `parse`)
|
||||
or {} -- fallback otherwise
|
||||
opts.looplimit = opts.looplimit or htmlparser_looplimit
|
||||
|
||||
local text = str(text)
|
||||
local limit = limit or opts.looplimit or 1000
|
||||
local tpl = false
|
||||
|
||||
if not opts.keep_comments then -- Strip (or not) comments {{{
|
||||
text = text:gsub("<!%-%-.-%-%->","") -- Many chances commented code will have syntax errors, that'll lead to parser failures
|
||||
end -- }}}
|
||||
|
||||
local tpr={}
|
||||
|
||||
if not opts.keep_danger_placeholders then -- {{{ little speedup by cost of potential parsing breakages
|
||||
-- search unused "invalid" bytes {{{
|
||||
local busy,i={},0;
|
||||
repeat -- {{{
|
||||
local cc = char(i)
|
||||
if not(text:match(cc)) then -- {{{
|
||||
if not(tpr["<"]) or not(tpr[">"]) then -- {{{
|
||||
if not(busy[i]) then -- {{{
|
||||
if not(tpr["<"]) then -- {{{
|
||||
tpr["<"] = cc;
|
||||
elseif not(tpr[">"]) then
|
||||
tpr[">"] = cc;
|
||||
end -- }}}
|
||||
busy[i] = true
|
||||
dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",str(c),cc:byte(),str(tpr[c]))
|
||||
dbg("busy[i]:{%s},i:{%d}",str(busy[i]),i)
|
||||
dbg("[FindPH]:#LINE# Success! || i=%d",i)
|
||||
else -- if !busy
|
||||
dbg("[FindPH]:#LINE# Busy! || i=%d",i)
|
||||
end -- if !busy -- }}}
|
||||
dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",c,cc:byte(),str(tpr[c]))
|
||||
dbg("%s",str(busy[i]))
|
||||
else -- if < or >
|
||||
dbg("[FindPH]:#LINE# Done!",i)
|
||||
break
|
||||
end -- if < or > -- }}}
|
||||
else -- text!match(cc)
|
||||
dbg("[FindPH]:#LINE# Text contains this byte! || i=%d",i)
|
||||
end -- text!match(cc) -- }}}
|
||||
local skip=1
|
||||
if i==31 then
|
||||
skip=96 -- ASCII
|
||||
end
|
||||
i=i+skip
|
||||
until (i==255) -- }}}
|
||||
i=nil
|
||||
--- }}}
|
||||
|
||||
if not(tpr["<"]) or not(tpr[">"]) then
|
||||
err("Impossible to find at least two unused byte codes in this HTML-code. We need it to escape bracket-contained placeholders inside tags.")
|
||||
err("Consider enabling 'keep_danger_placeholders' option (to silence this error, if parser wasn't failed with current HTML-code) or manually replace few random bytes, to free up the codes.")
|
||||
else
|
||||
dbg("[FindPH]:#LINE# Found! || '<'=%d, '>'=%d",tpr["<"]:byte(),tpr[">"]:byte())
|
||||
end
|
||||
|
||||
-- dbg("tpr[>] || tpr[] || #busy%d")
|
||||
|
||||
-- g {{{
|
||||
local function g(id,...)
|
||||
local arg={...}
|
||||
local orig=arg[id]
|
||||
arg[id]=arg[id]:gsub("(.)",tpr)
|
||||
if arg[id] ~= orig then
|
||||
tpl=true
|
||||
dbg("[g]:#LINE# orig: %s", str(orig))
|
||||
dbg("[g]:#LINE# replaced: %s",str(arg[id]))
|
||||
end
|
||||
dbg("[g]:#LINE# called, id: %s, arg[id]: %s, args { "..(("{%s}, "):rep(#arg):gsub(", $","")).." }",id,arg[id],...)
|
||||
dbg("[g]:#LINE# concat(arg): %s",table.concat(arg))
|
||||
return table.concat(arg)
|
||||
end
|
||||
-- g }}}
|
||||
|
||||
-- tpl-placeholders and attributes {{{
|
||||
text=text
|
||||
:gsub(
|
||||
"(=[%s]-)".. -- only match attr.values, and not random strings between two random apostrophs
|
||||
"(%b'')",
|
||||
function(...)return g(2,...)end
|
||||
)
|
||||
:gsub(
|
||||
"(=[%s]-)".. -- same for "
|
||||
'(%b"")',
|
||||
function(...)return g(2,...)end
|
||||
) -- Escape "<"/">" inside attr.values (see issue #50)
|
||||
:gsub(
|
||||
"(<".. -- Match "<",
|
||||
(opts.tpl_skip_pattern or "[^!]").. -- with exclusion pattern (for example, to ignore comments, which aren't template placeholders, but can legally contain "<"/">" inside.
|
||||
")([^>]+)".. -- If matched, we want to escape '<'s if we meet them inside tag
|
||||
"(>)",
|
||||
function(...)return g(2,...)end
|
||||
)
|
||||
:gsub(
|
||||
"("..
|
||||
(tpr["<"] or "__FAILED__").. -- Here we search for "<", we escaped in previous gsub (and don't break things if we have no escaping replacement)
|
||||
")("..
|
||||
(opts.tpl_marker_pattern or "[^%w%s]").. -- Capture templating symbol
|
||||
")([%g%s]-)".. -- match placeholder's content
|
||||
"(%2)(>)".. -- placeholder's tail
|
||||
"([^>]*>)", -- remainings
|
||||
function(...)return g(5,...)end
|
||||
)
|
||||
-- }}}
|
||||
end -- }}}
|
||||
|
||||
local index = 0
|
||||
local root = ElementNode:new(index, str(text))
|
||||
local node, descend, tpos, opentags = root, true, 1, {}
|
||||
|
||||
while true do -- MainLoop {{{
|
||||
if index == limit then -- {{{
|
||||
err("Main loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit)
|
||||
break
|
||||
end -- }}}
|
||||
-- openstart/tpos Definitions {{{
|
||||
local openstart, name
|
||||
openstart, tpos, name = root._text:find(
|
||||
"<" .. -- an uncaptured starting "<"
|
||||
"([%w-]+)" .. -- name = the first word, directly following the "<"
|
||||
"[^>]*>", -- include, but not capture everything up to the next ">"
|
||||
tpos)
|
||||
dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name))
|
||||
-- }}}
|
||||
if not name then break end
|
||||
-- Some more vars {{{
|
||||
index = index + 1
|
||||
local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos)
|
||||
node = tag
|
||||
local tagloop
|
||||
local tagst, apos = tag:gettext(), 1
|
||||
-- }}}
|
||||
while true do -- TagLoop {{{
|
||||
dbg("[TagLoop]:#LINE# tag.name=%s, tagloop=%s",str(tag.name),str(tagloop))
|
||||
if tagloop == limit then -- {{{
|
||||
err("Tag parsing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit)
|
||||
break
|
||||
end -- }}}
|
||||
-- Attrs {{{
|
||||
local start, k, eq, quote, v, zsp
|
||||
start, apos, k, zsp, eq, zsp, quote = tagst:find(
|
||||
"%s+" .. -- some uncaptured space
|
||||
"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
|
||||
"([%s]-)".. -- zero or more spaces
|
||||
"(=?)" .. -- eq = the optional; "=", else ""
|
||||
"([%s]-)".. -- zero or more spaces
|
||||
[=[(['"]?)]=], -- quote = an optional "'" or '"' following the "=", or ""
|
||||
apos)
|
||||
dbg("[TagLoop]:#LINE# start=%s || apos=%s || k=%s || zsp='%s' || eq='%s', quote=[%s]",str(start),str(apos),str(k),str(zsp),str(eq),str(quote))
|
||||
-- }}}
|
||||
if not k or k == "/>" or k == ">" then break end
|
||||
-- Pattern {{{
|
||||
if eq == "=" then
|
||||
local pattern = "=([^%s>]*)"
|
||||
if quote ~= "" then
|
||||
pattern = quote .. "([^" .. quote .. "]*)" .. quote
|
||||
end
|
||||
start, apos, v = tagst:find(pattern, apos)
|
||||
dbg("[TagLoop]:#LINE# start=%s || apos=%s || v=%s || pattern=%s",str(start),str(apos),str(v),str(pattern))
|
||||
end
|
||||
-- }}}
|
||||
v=v or ""
|
||||
if tpl then -- {{{
|
||||
for rk,rv in pairs(tpr) do
|
||||
v = v:gsub(rv,rk)
|
||||
dbg("[TagLoop]:#LINE# rv=%s || rk=%s",str(rv),str(rk))
|
||||
end
|
||||
end -- }}}
|
||||
|
||||
dbg("[TagLoop]:#LINE# k=%s || v=%s",str(k),str(v))
|
||||
tag:addattribute(k, v)
|
||||
tagloop = (tagloop or 0) + 1
|
||||
end
|
||||
-- }}}
|
||||
if voidelements[tag.name:lower()] then -- {{{
|
||||
descend = false
|
||||
tag:close()
|
||||
else
|
||||
descend = true
|
||||
opentags[tag.name] = opentags[tag.name] or {}
|
||||
table.insert(opentags[tag.name], tag)
|
||||
end
|
||||
-- }}}
|
||||
local closeend = tpos
|
||||
local closingloop
|
||||
while true do -- TagCloseLoop {{{
|
||||
-- Can't remember why did I add that, so comment it for now (and not remove), in case it will be needed again
|
||||
-- (although, it causes #59 and #60, so it will anyway be needed to rework)
|
||||
-- if voidelements[tag.name:lower()] then break end -- already closed
|
||||
if closingloop == limit then
|
||||
err("Tag closing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit)
|
||||
break
|
||||
end
|
||||
|
||||
local closestart, closing, closename
|
||||
closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)
|
||||
dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename))
|
||||
|
||||
if not closing or closing == "" then break end
|
||||
|
||||
tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
|
||||
closestart = root._text:find("<", closestart)
|
||||
dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart))
|
||||
tag:close(closestart, closeend + 1)
|
||||
node = tag.parent
|
||||
descend = true
|
||||
closingloop = (closingloop or 0) + 1
|
||||
end -- }}}
|
||||
end -- }}}
|
||||
if tpl then -- {{{
|
||||
dbg("tpl")
|
||||
for k,v in pairs(tpr) do
|
||||
root._text = root._text:gsub(v,k)
|
||||
end
|
||||
end -- }}}
|
||||
return root
|
||||
end -- }}}
|
||||
HtmlParser.parse = parse
|
||||
return HtmlParser
|
|
@ -0,0 +1,283 @@
|
|||
-- vim: ft=lua ts=2
|
||||
--- Minimal set type: elements are stored as keys whose value is `true`.
-- Methods live on `Set` and are reached through the metatable's `__index`.
local Set = {}
Set.mt = {__index = Set}

--- Create a set.
-- @param values nil (empty set), an existing Set (copied), an array-like
--               table (its values become elements), any other table (its
--               keys become elements), or a single non-table value
-- @return new Set instance
function Set:new(values)
  local instance = {}
  local isSet = getmetatable(values) == Set.mt
  if type(values) == "table" then
    if not isSet and #values > 0 then
      -- plain list: the listed values are the elements
      for _,v in ipairs(values) do
        instance[v] = true
      end
    else
      -- Set (or key-only table): the keys are the elements
      for k in pairs(values) do
        instance[k] = true
      end
    end
  elseif values ~= nil then
    instance = {[values] = true}
  end
  return setmetatable(instance, Set.mt)
end

--- Add element `e` (nil is ignored); returns the set for chaining.
function Set:add(e)
  if e ~= nil then self[e] = true end
  return self
end

--- Remove element `e` (nil is ignored); returns the set for chaining.
function Set:remove(e)
  if e ~= nil then self[e] = nil end
  return self
end

--- Return the elements as an array, in unspecified order.
function Set:tolist()
  local res = {}
  for k in pairs(self) do
    table.insert(res, k)
  end
  return res
end

-- Union
Set.mt.__add = function (a, b)
  local res = Set:new()
  for k in pairs(Set:new(a)) do res[k] = true end
  for k in pairs(Set:new(b)) do res[k] = true end
  return res
end

-- Subtraction
Set.mt.__sub = function (a, b)
  local res = Set:new()
  for k in pairs(Set:new(a)) do res[k] = true end
  for k in pairs(Set:new(b)) do res[k] = nil end
  return res
end

-- Intersection
Set.mt.__mul = function (a, b)
  local res, left, right = Set:new(), Set:new(a), Set:new(b)
  for k in pairs(left) do
    -- rawget: a plain `right[k]` falls through `__index` to the Set methods,
    -- so an element named like a method (e.g. "add") would be copied into
    -- the result as a function even though `right` does not contain it.
    res[k] = rawget(right, k)
  end
  return res
end

-- String representation
Set.mt.__tostring = function (set)
  local s = "{"
  local sep = ""
  for k in pairs(set) do
    s = s .. sep .. tostring(k)
    sep = ", "
  end
  return s .. "}"
end
|
||||
|
||||
|
||||
local ElementNode = {}
|
||||
ElementNode.mt = {__index = ElementNode}
|
||||
--- Create a node and attach it to the tree.
-- Without `node` the new instance is the root: it is named "root", keeps
-- `nameortext` as the document text and spans that whole text.
-- With `node` and `descend` it becomes a child of `node`; with `node` but no
-- `descend` it becomes a sibling of `node` (child of `node.parent`, or of
-- `node` itself when `node` has no parent).
-- @param index      sequence number of the node
-- @param nameortext tag name, or the full document text for the root
-- @param node       reference node, or nil to create the root
-- @param descend    truthy to nest under `node`
-- @param openstart  start offset of the opening tag
-- @param openend    end offset of the opening tag
-- @return the new ElementNode
function ElementNode:new(index, nameortext, node, descend, openstart, openend)
  local instance = {
    index = index,
    name = nameortext,
    level = 0,
    nodes = {},
    -- until close() is called the closing span equals the opening span
    _openstart = openstart, _openend = openend,
    _closestart = openstart, _closeend = openend,
    attributes = {},
    classes = {},
    deepernodes = Set:new(),
    deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {}
  }

  if not node then
    -- root node: owns the text and covers all of it
    local length = string.len(nameortext)
    instance.name = "root"
    instance.root = instance
    instance._text = nameortext
    instance._openstart, instance._openend = 1, length
    instance._closestart, instance._closeend = 1, length
    return setmetatable(instance, ElementNode.mt)
  end

  instance.root = node.root
  if descend then
    instance.parent = node
    instance.level = node.level + 1
    table.insert(node.nodes, instance)
  else
    --XXX: adds some safety but needs more testing for heisenbugs in corner cases
    instance.parent = node.parent or node
    instance.level = node.level
    table.insert((node.parent and node.parent.nodes or node.nodes), instance)
  end
  return setmetatable(instance, ElementNode.mt)
end
|
||||
|
||||
--- Return the slice of the root text from this node's opening start to its
-- closing end.
function ElementNode:gettext()
  local source = self.root._text
  return string.sub(source, self._openstart, self._closeend)
end
|
||||
|
||||
--- Replace the text stored on this node's root with `c`.
function ElementNode:settext(c)
  local root = self.root
  root._text = c
end
|
||||
|
||||
--- Return this node's text with every "<...>" run removed.
function ElementNode:textonly()
  -- keep only the string; gsub's replacement count is discarded
  local stripped = self:gettext():gsub("<[^>]*>","")
  return stripped
end
|
||||
|
||||
--- Return the root text strictly between the end of this node's opening span
-- and the start of its closing span.
function ElementNode:getcontent()
  local first = self._openend + 1
  local last = self._closestart - 1
  return string.sub(self.root._text, first, last)
end
|
||||
|
||||
--- Record attribute `k` = `v` on this node.
-- An "id" attribute (any letter case) is also stored in `self.id`; a "class"
-- attribute has each whitespace-separated token appended to `self.classes`.
-- @param k attribute name
-- @param v attribute value
function ElementNode:addattribute(k, v)
  self.attributes[k] = v
  local lowered = string.lower(k)
  if lowered == "id" then
    self.id = v
  elseif lowered == "class" then
    -- class attribute contains "space-separated tokens", each of which we'd like quick access to
    local classes = self.classes
    for class in string.gmatch(v, "%S+") do
      table.insert(classes, class)
    end
  end
end
|
||||
|
||||
--- Add `node` to the Set stored under `index[name]`, creating that Set on
-- first use.
-- The first parameter used to be called `table`, which hid the standard
-- `table` library inside this function.
-- @param index lookup table mapping names to Sets of nodes
-- @param name  key to file the node under
-- @param node  node to add
local function insert(index, name, node)
  local bucket = index[name]
  if not bucket then
    bucket = Set:new()
    index[name] = bucket
  end
  bucket:add(node)
end
|
||||
|
||||
--- Close this node and register it with all of its ancestors.
-- When both offsets are given they become the node's closing span. The node
-- is then added to every ancestor's `deepernodes` set and to its lookup
-- indexes by element name, attribute names, id and classes.
-- @param closestart start offset of the closing tag (optional)
-- @param closeend   end offset of the closing tag (optional)
function ElementNode:close(closestart, closeend)
  if closestart and closeend then
    self._closestart, self._closeend = closestart, closeend
  end

  -- inform higher level nodes about this element's existence in their branches
  local ancestor = self.parent
  while ancestor do
    ancestor.deepernodes:add(self)
    insert(ancestor.deeperelements, self.name, self)
    for attrname in pairs(self.attributes) do
      insert(ancestor.deeperattributes, attrname, self)
    end
    if self.id then
      insert(ancestor.deeperids, self.id, self)
    end
    for _, classname in ipairs(self.classes) do
      insert(ancestor.deeperclasses, classname, self)
    end
    ancestor = ancestor.parent
  end
end
|
||||
|
||||
--- Escape Lua-pattern magic characters so `s` can be matched literally.
-- @param s text to escape
-- @return exactly one value: `s` with ^ $ ( ) % . [ ] * + - ? prefixed by "%"
local function escape(s)
  -- The outer parentheses discard gsub's substitution count. Without them
  -- `string.match(a, escape(v))` receives the count as its `init` argument
  -- and starts searching that many characters into `a`.
  return (string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1"))
end
|
||||
|
||||
local function select(self, s)
|
||||
if not s or type(s) ~= "string" or s == "" then return Set:new() end
|
||||
local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes,
|
||||
["#"] = self.deeperids, ["."] = self.deeperclasses}
|
||||
local function match(t, w)
|
||||
local m, e, v
|
||||
if t == "[" then w, m, e, v = string.match(w,
|
||||
"([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
|
||||
"([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
|
||||
"(=?)" .. -- e = the optional "="
|
||||
"(.*)" -- v = anything following the "=", or else ""
|
||||
)
|
||||
end
|
||||
local matched = Set:new(sets[t][w])
|
||||
-- attribute value selectors
|
||||
if e == "=" then
|
||||
if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted
|
||||
v = string.sub(v, 2, #v - 1) -- strip quotes
|
||||
if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute
|
||||
for node in pairs(matched) do
|
||||
local a = node.attributes[w]
|
||||
-- equals
|
||||
if m == "" and a ~= v then matched:remove(node)
|
||||
-- not equals
|
||||
elseif m == "!" and a == v then matched:remove(node)
|
||||
-- prefix
|
||||
elseif m =="|" and string.match(a, "^[^-]*") ~= v then matched:remove(node)
|
||||
-- contains
|
||||
elseif m =="*" and string.match(a, escape(v)) ~= v then matched:remove(node)
|
||||
-- word
|
||||
elseif m =="~" then matched:remove(node)
|
||||
for word in string.gmatch(a, "%S+") do
|
||||
if word == v then matched:add(node) break end
|
||||
end
|
||||
-- starts with
|
||||
elseif m =="^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node)
|
||||
-- ends with
|
||||
elseif m =="$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node)
|
||||
end
|
||||
end -- for node
|
||||
end -- if v
|
||||
return matched
|
||||
end
|
||||
|
||||
local subjects, resultset, childrenonly = Set:new({self})
|
||||
for part in string.gmatch(s, "%S+") do
|
||||
repeat
|
||||
if part == ">" then childrenonly = true --[[goto nextpart]] break end
|
||||
resultset = Set:new()
|
||||
for subject in pairs(subjects) do
|
||||
local star = subject.deepernodes
|
||||
if childrenonly then star = Set:new(subject.nodes) end
|
||||
resultset = resultset + star
|
||||
end
|
||||
childrenonly = false
|
||||
if part == "*" then --[[goto nextpart]] break end
|
||||
local excludes, filter = Set:new()
|
||||
local start, pos = 0, 0
|
||||
while true do
|
||||
local switch, stype, name, eq, quote
|
||||
start, pos, switch, stype, name, eq, quote = string.find(part,
|
||||
"(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off
|
||||
"([:%[#.]?)" .. -- stype = a possible :, [, #, or .
|
||||
"([%w-_\\]+)" .. -- name = 1 or more alfanumeric chars (+ hyphen, reverse slash and uderscore)
|
||||
"([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or =
|
||||
"(['\"]?)", -- quote = a ' or " delimiting a possible attribute value
|
||||
pos + 1
|
||||
)
|
||||
if not name then break end
|
||||
repeat
|
||||
if ":" == stype then
|
||||
filter = name
|
||||
--[[goto nextname]] break
|
||||
end
|
||||
if ")" == switch then
|
||||
filter = nil
|
||||
end
|
||||
if "[" == stype and "" ~= quote then
|
||||
local value
|
||||
start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos)
|
||||
name = name .. eq .. value
|
||||
end
|
||||
local matched = match(stype, name)
|
||||
if filter == "not" then
|
||||
excludes = excludes + matched
|
||||
else
|
||||
resultset = resultset * matched
|
||||
end
|
||||
--::nextname::
|
||||
break
|
||||
until true
|
||||
end
|
||||
resultset = resultset - excludes
|
||||
subjects = Set:new(resultset)
|
||||
--::nextpart::
|
||||
break
|
||||
until true
|
||||
end
|
||||
resultset = resultset:tolist()
|
||||
table.sort(resultset, function (a, b) return a.index < b.index end)
|
||||
return resultset
|
||||
end
|
||||
|
||||
function ElementNode:select(s) return select(self, s) end
|
||||
ElementNode.mt.__call = select
|
||||
|
||||
return ElementNode
|
|
@ -0,0 +1,19 @@
|
|||
-- vim: ft=lua ts=2
|
||||
return {
|
||||
area = true,
|
||||
base = true,
|
||||
br = true,
|
||||
col = true,
|
||||
command = true,
|
||||
embed = true,
|
||||
hr = true,
|
||||
img = true,
|
||||
input = true,
|
||||
keygen = true,
|
||||
link = true,
|
||||
meta = true,
|
||||
param = true,
|
||||
source = true,
|
||||
track = true,
|
||||
wbr = true
|
||||
}
|
47
pipeline.py
47
pipeline.py
|
@ -18,12 +18,14 @@ import hashlib
|
|||
import shutil
|
||||
import socket
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
|
||||
project = Project(
|
||||
title = "Geometry Dash",
|
||||
title = "No",
|
||||
project_html = """
|
||||
<h2>Geometry Dash</h2>
|
||||
<p>Time to archive Geometry Dash?</p>
|
||||
<h2>Funeral homes</h2>
|
||||
<p>Archiving funeral homes, because who else will?</p>
|
||||
""",
|
||||
)
|
||||
|
||||
|
@ -34,7 +36,7 @@ project = Project(
|
|||
# It will be added to the WARC files and reported to the tracker.
|
||||
VERSION = '20220428.01'
|
||||
#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
|
||||
TRACKER_ID = 'geometrytrash'
|
||||
TRACKER_ID = 'funeralhomestest'
|
||||
TRACKER_HOST = '172.17.0.1:8501'
|
||||
|
||||
WGET_AT = find_executable(
|
||||
|
@ -118,13 +120,11 @@ def get_hash(filename):
|
|||
CWD = os.getcwd()
|
||||
PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py'))
|
||||
LUA_SHA1 = get_hash(os.path.join(CWD, 'grab.lua'))
|
||||
GMD_LUA_SHA1 = get_hash(os.path.join(CWD, 'gmd.lua'))
|
||||
|
||||
def stats_id_function(item):
|
||||
d = {
|
||||
'pipeline_hash': PIPELINE_SHA1,
|
||||
'lua_hash': LUA_SHA1,
|
||||
'gmd_lua_hash': GMD_LUA_SHA1,
|
||||
'python_version': sys.version,
|
||||
}
|
||||
|
||||
|
@ -135,18 +135,10 @@ class MoveFiles(SimpleTask):
|
|||
SimpleTask.__init__(self, 'MoveFiles')
|
||||
|
||||
def process(self, item):
|
||||
os.rename('%(item_dir)s/%(warc_file_base)s.warc.gz' % item,
|
||||
'%(data_dir)s/%(warc_file_base)s.warc.gz' % item)
|
||||
|
||||
shutil.rmtree('%(item_dir)s' % item)
|
||||
|
||||
class AwfulBackfeed(SimpleTask):
|
||||
def __init__(self):
|
||||
SimpleTask.__init__(self, 'AwfulBackfeed')
|
||||
|
||||
def process(self, item):
|
||||
with open('%(item_dir)s/new_items' % item) as file:
|
||||
new_items = file.read()
|
||||
item["ts"] = time.time()
|
||||
item["dd"] = item["data_dir"].lstrip("grab/data/")
|
||||
shutil.move('%(item_dir)s/' % item,
|
||||
'/finished/%(dd)s_%(item_name)s_%(ts)s/' % item)
|
||||
|
||||
class WgetArgs(object):
|
||||
def realize(self, item):
|
||||
|
@ -162,18 +154,22 @@ class WgetArgs(object):
|
|||
'--truncate-output',
|
||||
'-e', 'robots=off',
|
||||
'--rotate-dns',
|
||||
'--page-requisites',
|
||||
'--timeout', '10',
|
||||
'--tries', '10',
|
||||
'--span-hosts',
|
||||
'--waitretry', '5000',
|
||||
'--waitretry', '0',
|
||||
'-w', '1',
|
||||
'--random-wait',
|
||||
'--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
|
||||
'--warc-header', 'operator: TheTechRobo <thetechrobo@protonmail.ch>',
|
||||
'--warc-header', json.dumps(stats_id_function(item)),
|
||||
'--warc-header', 'x-wget-at-project-version: ' + VERSION,
|
||||
'--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
|
||||
'--warc-dedup-url-agnostic',
|
||||
'--header', 'Contact: Discord TheTechRobo#7420',
|
||||
'--header', 'Connection: keep-alive',
|
||||
'--header', 'Accept-Language: en-US;q=0.9, en;q=0.8'
|
||||
'-U', 'Mozilla/5.0 (Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0 ; Operator: TheTechRobo thetechrobo@protonmail.ch',
|
||||
]
|
||||
|
||||
item['item_name_newline'] = item['item_name'].replace('\0', '\n')
|
||||
|
@ -183,8 +179,13 @@ class WgetArgs(object):
|
|||
for item_name in item['item_name'].split('\0'):
|
||||
wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
|
||||
wget_args.append('item-name://'+item_name)
|
||||
item_urls.append(item_name)
|
||||
wget_args.append(item_name)
|
||||
i_n = item_name.split(':')
|
||||
if i_n[0] == 'downsandson':
|
||||
item_name = [f'https://downsandsonfuneralhome.com/tribute/details/{i_n[1]}/Dr-Alex-Klym/obituary.html']
|
||||
if i_n[0] == 'tharpsontheimerfh':
|
||||
item_name = [f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}', f'https://www.tharpsontheimerfh.com/printnotice/{i_n[1]}/1o/1c/1q/0d/1b']
|
||||
item_urls+=(item_name)
|
||||
wget_args+=(item_name)
|
||||
|
||||
item['item_urls'] = item_urls
|
||||
item['custom_items'] = json.dumps(custom_items)
|
||||
|
@ -203,7 +204,7 @@ pipeline = Pipeline(
|
|||
GetItemFromTracker('http://{}/{}'
|
||||
.format(TRACKER_HOST, TRACKER_ID),
|
||||
downloader, VERSION),
|
||||
PrepareDirectories(warc_prefix='gmd'),
|
||||
PrepareDirectories(warc_prefix='funeralhome'),
|
||||
WgetDownload(
|
||||
WgetArgs(),
|
||||
max_tries=1,
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
--[[
|
||||
Author: Julio Manuel Fernandez-Diaz
|
||||
Date: January 12, 2007
|
||||
(For Lua 5.1)
|
||||
|
||||
Modified slightly by RiciLake to avoid the unnecessary table traversal in tablecount()
|
||||
|
||||
Formats tables with cycles recursively to any depth.
|
||||
The output is returned as a string.
|
||||
References to other tables are shown as values.
|
||||
Self references are indicated.
|
||||
|
||||
The string returned is "Lua code", which can be processed
|
||||
(in the case in which indent is composed by spaces or "--").
|
||||
Userdata and function keys and values are shown as strings,
|
||||
which logically are not exactly equivalent to the original code.
|
||||
|
||||
This routine can serve for pretty formatting tables with
|
||||
proper indentations, apart from printing them:
|
||||
|
||||
print(table.show(t, "t")) -- a typical use
|
||||
|
||||
Heavily based on "Saving tables with cycles", PIL2, p. 113.
|
||||
|
||||
Arguments:
|
||||
t is the table.
|
||||
name is the name of the table (optional)
|
||||
indent is a first indentation (optional).
|
||||
--]]
|
||||
--- Render a value (normally a table) as a string of Lua-like source text.
-- Nested tables are expanded recursively, each level indented by three more
-- spaces. A table that has already been printed is emitted as `{}` with a
-- "(self reference)" note, and an assignment that re-links it is appended
-- after the main body.
-- @param t      value to render; a non-table is returned as `name = <value>`
-- @param name   label used for the root (optional, default "__unnamed__")
-- @param indent prefix placed before the root line and carried into every
--               nested line (optional, default "")
-- @return the formatted string
function table.show(t, name, indent)
   -- Fragments are collected in arrays and joined once at the end, so the
   -- cost stays linear instead of re-copying a growing string on each append.
   local cart = {}     -- fragments of the main body
   local autoref = {}  -- trailing assignments for repeated tables

   -- (RiciLake) returns true if the table is empty; avoids counting entries
   local function isemptytable(tbl) return next(tbl) == nil end

   -- Turn a non-table value into its textual form: numbers and booleans are
   -- emitted bare, everything else is quoted with %q.
   local function basicSerialize(o)
      local so = tostring(o)
      if type(o) == "function" then
         local info = debug.getinfo(o, "S")
         -- info.name is nil because o is not a calling level
         if info.what == "C" then
            return string.format("%q", so .. ", C function")
         end
         -- the information is defined through lines
         return string.format("%q", so .. ", defined in (" ..
            info.linedefined .. "-" .. info.lastlinedefined ..
            ")" .. info.source)
      elseif type(o) == "number" or type(o) == "boolean" then
         return so
      end
      return string.format("%q", so)
   end

   -- Append the rendering of `value` to the buffers.
   --   path  : full access path of the value (e.g. t["a"]["b"])
   --   pad   : indentation prefix for this level
   --   saved : map from already-printed tables to their access path
   --   field : text printed on the left of " = " (defaults to path)
   local function addtocart(value, path, pad, saved, field)
      pad = pad or ""
      saved = saved or {}
      field = field or path

      cart[#cart + 1] = pad .. field

      if type(value) ~= "table" then
         cart[#cart + 1] = " = " .. basicSerialize(value) .. ";\n"
      elseif saved[value] then
         -- Already printed: emit a placeholder and re-link it afterwards.
         cart[#cart + 1] = " = {}; -- " .. saved[value]
            .. " (self reference)\n"
         autoref[#autoref + 1] = path .. " = " .. saved[value] .. ";\n"
      else
         saved[value] = path
         if isemptytable(value) then
            cart[#cart + 1] = " = {};\n"
         else
            cart[#cart + 1] = " = {\n"
            for k, v in pairs(value) do
               local key = basicSerialize(k)
               local child_path = string.format("%s[%s]", path, key)
               local child_field = string.format("[%s]", key)
               -- three spaces between levels
               addtocart(v, child_path, pad .. "   ", saved, child_field)
            end
            cart[#cart + 1] = pad .. "};\n"
         end
      end
   end

   name = name or "__unnamed__"
   if type(t) ~= "table" then
      return name .. " = " .. basicSerialize(t)
   end
   addtocart(t, name, indent)
   return table.concat(cart) .. table.concat(autoref)
end
|
||||
|
Loading…
Reference in New Issue