All done.

Seems to work.

Discovery for TharpsonTheimer hasn't been done yet.
This commit is contained in:
TheTechRobo 2022-05-16 22:16:55 -04:00
parent 4879afb63d
commit 8df93e1d56
8 changed files with 740 additions and 180 deletions

View File

@ -1,7 +1,7 @@
run:
make clean
docker build -t img .
docker run --rm img test
docker run -v "/media/thetechrobo/2tb/obitdata:/finished" --rm img TheTechRobo --concurrent 1
clean:
rm -rf img

139
gmd.lua
View File

@ -1,139 +0,0 @@
-- https://stackoverflow.com/questions/40149617/split-string-with-specified-delimiter-in-lua
--- Split a string into the non-empty runs found between separator characters.
-- `sep` is interpolated into a `[^...]` character set, so every character in it
-- acts as a delimiter.
-- @param s   string to split
-- @param sep delimiter character(s); defaults to a single space
-- @return array of the pieces in order; empty pieces are dropped
function split(s, sep)
  local delimiter = sep or " "
  local pieces = {}
  for piece in string.gmatch(s, string.format("([^%s]+)", delimiter)) do
    pieces[#pieces + 1] = piece
  end
  return pieces
end
-- https://stackoverflow.com/questions/40149617/split-string-with-specified-delimiter-in-lua
--
GMD = {}
GMD["comments"] = {}
--- Render a value as a string of Lua-like source text.
-- Tables are walked recursively. A table that was already visited is printed
-- as `{}` with a "(self reference)" remark, and an assignment restoring the
-- reference is appended after the main text.
-- @param t      value to render
-- @param name   label used on the left-hand side (defaults to "__unnamed__")
-- @param indent prefix put in front of the lines of a rendered table (optional)
-- @return the rendered string
function table.show(t, name, indent)
  local out      -- text produced so far
  local backrefs -- trailing assignments that restore self references

  -- Turn a non-table value into literal text: numbers and booleans verbatim,
  -- functions as a quoted description of where they were defined, anything
  -- else as a quoted tostring().
  local function literal(o)
    local text = tostring(o)
    local kind = type(o)
    if kind == "number" or kind == "boolean" then
      return text
    end
    if kind ~= "function" then
      return string.format("%q", text)
    end
    local info = debug.getinfo(o, "S")
    if info.what == "C" then
      return string.format("%q", text .. ", C function")
    end
    return string.format("%q", text .. ", defined in (" ..
      info.linedefined .. "-" .. info.lastlinedefined ..
      ")" .. info.source)
  end

  -- Append `label = <value>` to `out`; `path` is the full access expression of
  -- the value and `seen` maps visited tables to the path they were first
  -- reached by.
  local function emit(value, path, pad, seen, label)
    pad = pad or ""
    seen = seen or {}
    label = label or path
    out = out .. pad .. label
    if type(value) ~= "table" then
      out = out .. " = " .. literal(value) .. ";\n"
      return
    end
    if seen[value] then
      out = out .. " = {}; -- " .. seen[value]
        .. " (self reference)\n"
      backrefs = backrefs .. path .. " = " .. seen[value] .. ";\n"
      return
    end
    seen[value] = path
    if next(value) == nil then
      out = out .. " = {};\n"
      return
    end
    out = out .. " = {\n"
    for k, v in pairs(value) do
      local key = literal(k)
      local child_path = string.format("%s[%s]", path, key)
      local child_label = string.format("[%s]", key)
      -- nested entries are indented one level deeper than their parent
      emit(v, child_path, pad .. " ", seen, child_label)
    end
    out = out .. pad .. "};\n"
  end

  name = name or "__unnamed__"
  if type(t) ~= "table" then
    return name .. " = " .. literal(t)
  end
  out, backrefs = "", ""
  emit(t, name, indent)
  return out .. backrefs
end
GMD.comments.mapping = {"levelID","comment","authorPlayerID","likes","dislikes","messageID","spam","authorAccountID","age","percent","modBadge","moderatorChatColor"} -- https://docs.gdprogra.me/#/resources/server/comment
--- Parse a comments response string into per-comment key/value tables.
-- The input is split on ":"; the first piece holds the comments and the second
-- is kept as `account`. Comments are separated by "|" and each comment is a
-- "~"-separated list of alternating numeric key and value. Numeric keys are
-- translated to field names through GMD.comments.mapping.
-- @param comments raw response text
-- @return false when the text has no second ":"-separated piece; otherwise
--   { comment = <raw comments>, account = <second piece>,
--     parsed = { comment = { [i] = { <field name> = <value>, ... } } } }
GMD["comments"]["parse"] = function(comments)
  local sections = split(comments, ":")
  if not sections[2] then
    return false
  end
  local result = {
    comment = sections[1],
    account = sections[2],
    parsed = { comment = {} },
  }
  local entries = split(result.comment, "|")
  for i = 1, #entries do
    local entry = {}
    local fields = split(entries[i], "~")
    -- Fields come in key/value pairs; a trailing key without a value is ignored.
    for j = 1, #fields - 1, 2 do
      local raw_key = fields[j]
      -- Keep the key local (it used to be written to a global). Keys missing
      -- from the mapping fall back to their raw text instead of raising
      -- "table index is nil".
      local key = GMD.comments.mapping[tonumber(raw_key)] or raw_key
      entry[key] = fields[j + 1]
    end
    result.parsed.comment[i] = entry
  end
  return result
end
--- Parse `comments` and return a single parsed comment.
-- @param self     table providing `parse` (called as self.parse(comments))
-- @param comments raw response text handed to `parse`
-- @param pos      index of the comment to return; defaults to 1
-- @return the parsed comment table at `pos`, or false when parsing failed
GMD["comments"]["getOneComment"] = function (self, comments, pos)
  local result = self.parse(comments)
  if result then
    return result.parsed.comment[pos or 1]
  end
  return false
end
-- Self-check for the comment parser; it is invoked right after its definition.
-- Checks that the first parsed comment of a sample response exposes its
-- "comment" field, and that input without a ":" separator yields a falsy result.
function GMDtest()
-- sample response: ten "|"-separated comments followed by a "#..." trailer
local strin = "2~NzUwMCBzdGFycyBjOg==~4~3~9~1 month~6~1803945|2~SSBiZWF0IDYgaW5zYW5lIGRlbW9ucyBpbiAyNCBob3VycyBsbWFvOiBOZWNyb3BvbGlzLCBUaGUgQ2F2ZXJucyBJSSwgRWxlbWVudHMgWCwgWCBBZHZlbnR1cmUsIFNhZGlzbSwgYW5kIEJsYXN0ZXIgYzo=~4~21~9~8 months~6~1793260|2~L1wvXC9cIDwz~4~6~9~1 year~6~1785414|2~U2VudCBmcm9tIGlPUyBTaG9ydGN1dHMh~4~8~9~1 year~6~1776426|2~VGhpcyBjb21tZW50IHdhcyB1cGxvYWRlZCBmb3IgdGhlIEdEIERvY3Mh~4~5~9~1 year~6~1772719|2~VGhlIHRyaWxvZ3kgaGFzIGJlZW4gY29tcGxldGVkLi4uR0cgQWZ0ZXJtYXRoIQ==~4~8~9~1 year~6~1766450|2~Im93byIgLSBGb3VuZG15YmFsbA==~4~4~9~1 year~6~1766338|2~NTAwMCBzdGFycyE=~4~12~9~2 years~6~1756926|2~Qmxvb2RiYXRoIEdHISEh~4~24~9~2 years~6~1745624|2~QWxsZWdpYW5jZSAxMDAl~4~3~9~2 years~6~1744292#73:0:10"
assert(GMD["comments"]:getOneComment(strin)["comment"] == "NzUwMCBzdGFycyBjOg==")
-- "-1" contains no ":" so parse() returns false
assert(not GMD["comments"]:getOneComment("-1"))
end
GMDtest()

View File

@ -1,6 +1,5 @@
require "gmd"
NEW_ITEMS = {}
local htmlparser = require "htmlparser"
require "table_show"
function readAll(file)
local f = assert(io.open(file, "rb"))
@ -9,19 +8,54 @@ function readAll(file)
return content
end
-- print(table.show(false))
wget.callbacks.httploop_result = function(url, err, http_stat)
local data = readAll(http_stat.local_file)
-- Time to make sure that it's a valid response.
local result = GMD.comments.parse(data)
if result then
return wget.actions.NOTHING
else
io.stderr:write("\aYou've been IP-banned from Geometry Dash's servers. Sorry about that.\n")
io.stderr:write("Please let us know in #geometrytrash on hackint!\n")
io.stderr:write("Sleeping 69 seconds. (nice)\n")
os.execute("sleep 69")
return wget.actions.ABORT -- We've been banned
end
QUEUED_URLS = false
--- Report whether `text` begins with the literal string `prefix`.
-- The search runs in plain mode, so pattern magic characters in `prefix` are
-- matched verbatim.
-- @return true when the first occurrence of `prefix` is at position 1
function startswith(text, prefix)
  local first = text:find(prefix, 1, true)
  return first == 1
end
-- Log each finished request to stderr as the status code immediately followed
-- by the URL and a newline. Nothing is returned.
wget.callbacks.httploop_result = function(url, err, http_stat)
  local log_line = http_stat["statcode"] .. url["url"] .. "\n"
  io.stderr:write(log_line)
end
--- Extract follow-up requests from a downloaded page.
-- Reads the downloaded file, and for the two recognised page kinds parses the
-- HTML and queues further URLs. Raises an error (with a message naming the
-- selector) when an expected element is missing or ambiguous.
-- @param file   path of the downloaded document
-- @param url    URL the document was fetched from
-- @param is_css unused
-- @param iri    unused
-- @return array of { url = ..., post_data = ... } tables to queue
wget.callbacks.get_urls = function(file, url, is_css, iri)
  local addedUrls = {}
  local data = readAll(file)
  io.stderr:write("Read data\n")

  -- Run `selector` against `root`, insisting that at most one element matches.
  local function single(root, selector)
    local found = root(selector)
    assert(not found[2], "more than one element matches " .. selector)
    return found[1]
  end

  -- Queue base .. href for the one element matching `selector`. A missing
  -- element is an error unless `optional` is set.
  local function queue_link(root, base, selector, optional)
    local element = single(root, selector)
    if not element then
      assert(optional, "no element matches " .. selector)
      return
    end
    local href = element.attributes.href
    assert(href, "element matching " .. selector .. " has no href attribute")
    table.insert(addedUrls, { url = base .. href })
  end

  if url:match("https://downsandsonfuneralhome%.com/tribute/details/[^/]+/Dr%-Alex%-Klym/obituary%.html") then
    local base = "https://downsandsonfuneralhome.com"
    local root = htmlparser.parse(data)
    io.stderr:write("Read root\n")
    -- The memories link is only checked for uniqueness, never queued.
    -- NOTE(review): looks like an omission — confirm whether it should be queued.
    single(root, "#memories-link-list-item a")
    queue_link(root, base, "#obituary-link-list-item a")
    queue_link(root, base, "#condolences-link-list-item a")
    queue_link(root, base, "#service-link-list-item a")
    queue_link(root, base, "#charities-link-list-item a")
    queue_link(root, base, ".print-obit-btn a", true)
    QUEUED_URLS = true
  end

  if startswith(url, "https://www.tharpsontheimerfh.com/tributes/") then
    local ok = assert(os.getenv("item_name"), "item_name environment variable is not set")
    local root = htmlparser.parse(data)
    local bar = assert(single(root, "#obitsbarV31"), "no element matches #obitsbarV31")
    local oid = assert(bar.attributes["data-oid"], "#obitsbarV31 has no data-oid attribute")
    table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/obpgsnvn", post_data="sn=tributewall&oid=" .. oid})
    table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/prnobit", post_data="ok=" .. ok .. "&fcf=0&bg=1"})
  end

  io.stderr:write(table.show(addedUrls, "Added URLs "))
  return addedUrls
end

256
htmlparser.lua Normal file
View File

@ -0,0 +1,256 @@
-- vim: ft=lua ts=2 sw=2
-- Syntactic Sugar {{{
-- Return (val) If it's Not Empty: gives back `val` when it is truthy and its
-- length is above zero. A falsy `val` is returned unchanged; an empty one
-- yields false.
local function rine(val)
  if not val then
    return val
  end
  if #val > 0 then
    return val
  end
  return false
end
-- Return (a) If it's Table: gives back `a` when it is a table, false otherwise.
local function rit(a)
  if type(a) ~= "table" then
    return false
  end
  return a
end
-- Placeholder used wherever a logging function is switched off.
local noop = function() end
-- Prefixes every Lua pattern magic character in s with "%".
-- Returns both results of string.gsub (the escaped string and the match count).
local esc = function(s) return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") end
local str = tostring
local char = string.char
local opts = rit(htmlparser_opts) or {} -- needed for silent/noerr/noout/nonl directives, also needed to be defined before `require` in such case
-- prn(l, f, ...): writes "[HTMLParser] [<L>] <formatted message>" where <L> is
-- the upper-cased level letter l. Level "i" goes to stdout, any other level to
-- stderr. The line is terminated with opts.nonl when set, else "\n".
-- NOTE(review): opts.nonl is concatenated as-is, so it presumably has to be a string — confirm.
local prn = opts.silent and noop or function(l,f,...)
local fd = (l=="i") and "stdout" or "stderr"
local t = (" [%s] "):format(l:upper())
io[fd]
:write('[HTMLParser]'..t..f:format(...)
..(opts.nonl or "\n")
)
end
-- Level-specific wrappers around prn; each can be disabled through opts.
local err = opts.noerr and noop or function(f,...) prn("e",f,...) end
local out = opts.noout and noop or function(f,...) prn("i",f,...) end
-- Current line number at stack level lvl (default 2); a no-op when the debug library is absent.
local line = debug and function(lvl) return debug.getinfo(lvl or 2).currentline end or noop
-- Debug logger: substitutes "#LINE#" in the format string with the line number
-- taken at stack level 3, then prints at level "d". Only active when opts.debug is set.
local dbg = opts.debug and function(f,...) prn("d",f:gsub("#LINE#",str(line(3))),...) end or noop
-- }}}
-- Requires {{{
local ElementNode = require"htmlparser.ElementNode"
local voidelements = require"htmlparser.voidelements"
--}}}
local HtmlParser = {}
-- Parse an HTML string into a tree of ElementNode objects and return the root node.
--   text:  value to parse; it is converted with tostring() first.
--   limit: optional cap checked by the main loop, the attribute loop and the
--          closing-tag loop; falls back to opts.looplimit, then to 1000.
-- Options read here: keep_comments, keep_danger_placeholders, tpl_skip_pattern,
-- tpl_marker_pattern and looplimit.
-- NOTE(review): rine() tests #opts > 0, which looks like it stays 0 for a table
-- that only holds named options — confirm the module-level table is ever picked below.
local function parse(text,limit) -- {{{
local opts = rine(opts) -- use top-level opts-table (the one, defined before requiring the module), if exists
or rit(htmlparser_opts) -- or defined after requiring (but before calling `parse`)
or {} -- fallback otherwise
opts.looplimit = opts.looplimit or htmlparser_looplimit
local text = str(text)
local limit = limit or opts.looplimit or 1000
-- tpl becomes true once g() has actually replaced a "<" or ">" with a placeholder byte
local tpl = false
if not opts.keep_comments then -- Strip (or not) comments {{{
text = text:gsub("<!%-%-.-%-%->","") -- Many chances commented code will have syntax errors, that'll lead to parser failures
end -- }}}
-- tpr maps "<" and ">" to the placeholder bytes chosen below
local tpr={}
if not opts.keep_danger_placeholders then -- {{{ little speedup by cost of potential parsing breakages
-- search unused "invalid" bytes {{{
local busy,i={},0;
repeat -- {{{
local cc = char(i)
if not(text:match(cc)) then -- {{{
if not(tpr["<"]) or not(tpr[">"]) then -- {{{
if not(busy[i]) then -- {{{
if not(tpr["<"]) then -- {{{
tpr["<"] = cc;
elseif not(tpr[">"]) then
tpr[">"] = cc;
end -- }}}
busy[i] = true
-- NOTE(review): `c` is not declared in this function, so it resolves to a global here — confirm.
dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",str(c),cc:byte(),str(tpr[c]))
dbg("busy[i]:{%s},i:{%d}",str(busy[i]),i)
dbg("[FindPH]:#LINE# Success! || i=%d",i)
else -- if !busy
dbg("[FindPH]:#LINE# Busy! || i=%d",i)
end -- if !busy -- }}}
dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",c,cc:byte(),str(tpr[c]))
dbg("%s",str(busy[i]))
else -- if < or >
dbg("[FindPH]:#LINE# Done!",i)
break
end -- if < or > -- }}}
else -- text!match(cc)
dbg("[FindPH]:#LINE# Text contains this byte! || i=%d",i)
end -- text!match(cc) -- }}}
-- after byte 31 the counter jumps to 127, so bytes 32..126 are never tried as placeholders
local skip=1
if i==31 then
skip=96 -- ASCII
end
i=i+skip
until (i==255) -- }}}
i=nil
--- }}}
if not(tpr["<"]) or not(tpr[">"]) then
err("Impossible to find at least two unused byte codes in this HTML-code. We need it to escape bracket-contained placeholders inside tags.")
err("Consider enabling 'keep_danger_placeholders' option (to silence this error, if parser wasn't failed with current HTML-code) or manually replace few random bytes, to free up the codes.")
else
dbg("[FindPH]:#LINE# Found! || '<'=%d, '>'=%d",tpr["<"]:byte(),tpr[">"]:byte())
end
-- dbg("tpr[>] || tpr[] || #busy%d")
-- g {{{
-- g(id, ...): takes the captures of a gsub match, replaces every "<" / ">" inside
-- capture number `id` with its placeholder byte, and returns all captures concatenated.
local function g(id,...)
local arg={...}
local orig=arg[id]
arg[id]=arg[id]:gsub("(.)",tpr)
if arg[id] ~= orig then
tpl=true
dbg("[g]:#LINE# orig: %s", str(orig))
dbg("[g]:#LINE# replaced: %s",str(arg[id]))
end
dbg("[g]:#LINE# called, id: %s, arg[id]: %s, args { "..(("{%s}, "):rep(#arg):gsub(", $","")).." }",id,arg[id],...)
dbg("[g]:#LINE# concat(arg): %s",table.concat(arg))
return table.concat(arg)
end
-- g }}}
-- tpl-placeholders and attributes {{{
text=text
:gsub(
"(=[%s]-)".. -- only match attr.values, and not random strings between two random apostrophs
"(%b'')",
function(...)return g(2,...)end
)
:gsub(
"(=[%s]-)".. -- same for "
'(%b"")',
function(...)return g(2,...)end
) -- Escape "<"/">" inside attr.values (see issue #50)
:gsub(
"(<".. -- Match "<",
(opts.tpl_skip_pattern or "[^!]").. -- with exclusion pattern (for example, to ignore comments, which aren't template placeholders, but can legally contain "<"/">" inside.
")([^>]+)".. -- If matched, we want to escape '<'s if we meet them inside tag
"(>)",
function(...)return g(2,...)end
)
:gsub(
"("..
(tpr["<"] or "__FAILED__").. -- Here we search for "<", we escaped in previous gsub (and don't break things if we have no escaping replacement)
")("..
(opts.tpl_marker_pattern or "[^%w%s]").. -- Capture templating symbol
")([%g%s]-)".. -- match placeholder's content
"(%2)(>)".. -- placeholder's tail
"([^>]*>)", -- remainings
function(...)return g(5,...)end
)
-- }}}
end -- }}}
local index = 0
local root = ElementNode:new(index, str(text))
local node, descend, tpos, opentags = root, true, 1, {}
-- Each iteration finds the next opening tag at or after tpos and creates a node for it.
while true do -- MainLoop {{{
if index == limit then -- {{{
err("Main loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit)
break
end -- }}}
-- openstart/tpos Definitions {{{
local openstart, name
openstart, tpos, name = root._text:find(
"<" .. -- an uncaptured starting "<"
"([%w-]+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">"
tpos)
dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name))
-- }}}
if not name then break end
-- Some more vars {{{
index = index + 1
local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos)
node = tag
local tagloop
local tagst, apos = tag:gettext(), 1
-- }}}
-- Walk the opening tag's text and register each attribute found in it.
while true do -- TagLoop {{{
dbg("[TagLoop]:#LINE# tag.name=%s, tagloop=%s",str(tag.name),str(tagloop))
if tagloop == limit then -- {{{
err("Tag parsing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit)
break
end -- }}}
-- Attrs {{{
local start, k, eq, quote, v, zsp
start, apos, k, zsp, eq, zsp, quote = tagst:find(
"%s+" .. -- some uncaptured space
"([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
"([%s]-)".. -- zero or more spaces
"(=?)" .. -- eq = the optional; "=", else ""
"([%s]-)".. -- zero or more spaces
[=[(['"]?)]=], -- quote = an optional "'" or '"' following the "=", or ""
apos)
dbg("[TagLoop]:#LINE# start=%s || apos=%s || k=%s || zsp='%s' || eq='%s', quote=[%s]",str(start),str(apos),str(k),str(zsp),str(eq),str(quote))
-- }}}
if not k or k == "/>" or k == ">" then break end
-- Pattern {{{
if eq == "=" then
local pattern = "=([^%s>]*)"
if quote ~= "" then
pattern = quote .. "([^" .. quote .. "]*)" .. quote
end
start, apos, v = tagst:find(pattern, apos)
dbg("[TagLoop]:#LINE# start=%s || apos=%s || v=%s || pattern=%s",str(start),str(apos),str(v),str(pattern))
end
-- }}}
v=v or ""
-- put the real "<" / ">" back into the attribute value when placeholders were used
if tpl then -- {{{
for rk,rv in pairs(tpr) do
v = v:gsub(rv,rk)
dbg("[TagLoop]:#LINE# rv=%s || rk=%s",str(rv),str(rk))
end
end -- }}}
dbg("[TagLoop]:#LINE# k=%s || v=%s",str(k),str(v))
tag:addattribute(k, v)
tagloop = (tagloop or 0) + 1
end
-- }}}
-- void elements are closed at once; other tags are remembered per name until their closing tag shows up
if voidelements[tag.name:lower()] then -- {{{
descend = false
tag:close()
else
descend = true
opentags[tag.name] = opentags[tag.name] or {}
table.insert(opentags[tag.name], tag)
end
-- }}}
local closeend = tpos
local closingloop
-- Consume the closing tags that directly follow, closing the most recently opened tag of the same name.
while true do -- TagCloseLoop {{{
-- Can't remember why did I add that, so comment it for now (and not remove), in case it will be needed again
-- (although, it causes #59 and #60, so it will anyway be needed to rework)
-- if voidelements[tag.name:lower()] then break end -- already closed
if closingloop == limit then
err("Tag closing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit)
break
end
local closestart, closing, closename
closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)
dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename))
if not closing or closing == "" then break end
tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
closestart = root._text:find("<", closestart)
dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart))
tag:close(closestart, closeend + 1)
node = tag.parent
descend = true
closingloop = (closingloop or 0) + 1
end -- }}}
end -- }}}
-- restore the real "<" / ">" in the stored document text
if tpl then -- {{{
dbg("tpl")
for k,v in pairs(tpr) do
root._text = root._text:gsub(v,k)
end
end -- }}}
return root
end -- }}}
HtmlParser.parse = parse
return HtmlParser

283
htmlparser/ElementNode.lua Normal file
View File

@ -0,0 +1,283 @@
-- vim: ft=lua ts=2
-- Minimal set type: members are stored as keys mapped to true, and the
-- metatable supplies union (+), difference (-), intersection (*) and tostring.
local Set = {}
Set.mt = {__index = Set}

-- Build a set from `values`:
--   * a non-Set table with a non-empty array part contributes its array values,
--   * any other table (including another Set) contributes its keys,
--   * any other non-nil value becomes the single member.
function Set:new(values)
  local members = {}
  if type(values) == "table" then
    local from_list = getmetatable(values) ~= Set.mt and #values > 0
    if from_list then
      for _, item in ipairs(values) do
        members[item] = true
      end
    else
      for key in pairs(values) do
        members[key] = true
      end
    end
  elseif values ~= nil then
    members[values] = true
  end
  return setmetatable(members, Set.mt)
end

-- Add `e` (ignored when nil); returns the set so calls can be chained.
function Set:add(e)
  if e == nil then
    return self
  end
  self[e] = true
  return self
end

-- Remove `e` (ignored when nil); returns the set so calls can be chained.
function Set:remove(e)
  if e == nil then
    return self
  end
  self[e] = nil
  return self
end

-- Return the members as an array, in unspecified order.
function Set:tolist()
  local list = {}
  for member in pairs(self) do
    list[#list + 1] = member
  end
  return list
end

-- Union
Set.mt.__add = function (a, b)
  local union, left, right = Set:new(), Set:new(a), Set:new(b)
  for _, operand in ipairs({ left, right }) do
    for member in pairs(operand) do
      union[member] = true
    end
  end
  return union
end

-- Subtraction
Set.mt.__sub = function (a, b)
  local difference, left, right = Set:new(), Set:new(a), Set:new(b)
  for member in pairs(left) do
    difference[member] = true
  end
  for member in pairs(right) do
    difference[member] = nil
  end
  return difference
end

-- Intersection
Set.mt.__mul = function (a, b)
  local common, left, right = Set:new(), Set:new(a), Set:new(b)
  for member in pairs(left) do
    -- copies whatever the right-hand set yields for this key
    common[member] = right[member]
  end
  return common
end

-- String representation
Set.mt.__tostring = function (set)
  local names = {}
  for member in pairs(set) do
    names[#names + 1] = tostring(member)
  end
  return "{" .. table.concat(names, ", ") .. "}"
end
local ElementNode = {}
ElementNode.mt = {__index = ElementNode}

-- Create a node and link it into the tree.
--   index:      sequence number stored on the node
--   nameortext: tag name, or the whole document text when creating the root
--   node:       reference node; when absent the new node is the root
--   descend:    true  -> the new node becomes a child of `node`
--               false -> the new node becomes a sibling of `node`
--   openstart, openend: positions of the opening tag in the document text;
--               they also initialise the closing positions
function ElementNode:new(index, nameortext, node, descend, openstart, openend)
  local instance = {
    index = index,
    name = nameortext,
    level = 0,
    nodes = {},
    attributes = {},
    classes = {},
    _openstart = openstart, _closestart = openstart,
    _openend = openend, _closeend = openend,
    deepernodes = Set:new(),
    deeperelements = {}, deeperattributes = {},
    deeperids = {}, deeperclasses = {},
  }

  if not node then
    -- root node: owns the document text and spans all of it
    local last = string.len(nameortext)
    instance.name = "root"
    instance.root = instance
    instance._text = nameortext
    instance._openstart, instance._closestart = 1, 1
    instance._openend, instance._closeend = last, last
    return setmetatable(instance, ElementNode.mt)
  end

  instance.root = node.root
  if descend then
    instance.parent = node
    instance.level = node.level + 1
    table.insert(node.nodes, instance)
  else
    local parent = node.parent
    instance.parent = parent or node --XXX: adds some safety but needs more testing for heisenbugs in corner cases
    instance.level = node.level
    table.insert(parent and parent.nodes or node.nodes, instance) --XXX: see above about heisenbugs
  end
  return setmetatable(instance, ElementNode.mt)
end
-- Return the slice of the document text from this node's opening position to
-- its closing end position.
function ElementNode:gettext()
  local document = self.root._text
  return string.sub(document, self._openstart, self._closeend)
end
-- Replace the document text stored on the root node with `c`.
function ElementNode:settext(c)
  local root = self.root
  root._text = c
end
-- Return this node's text with every "<...>" run removed.
function ElementNode:textonly()
  local stripped = self:gettext():gsub("<[^>]*>","")
  return stripped
end
-- Return the document text between the end of the opening tag and the start
-- of the closing tag.
function ElementNode:getcontent()
  local first = self._openend + 1
  local last = self._closestart - 1
  return string.sub(self.root._text, first, last)
end
-- Store attribute k = v. An "id" attribute (any letter case) is also copied to
-- self.id, and a "class" attribute has each of its whitespace-separated tokens
-- appended to self.classes.
function ElementNode:addattribute(k, v)
  self.attributes[k] = v
  local lowered = string.lower(k)
  if lowered == "id" then
    self.id = v
    return
  end
  if lowered == "class" then
    -- class attribute contains "space-separated tokens", each of which we'd like quick access to
    for token in string.gmatch(v, "%S+") do
      table.insert(self.classes, token)
    end
  end
end
-- Add `node` to the Set stored under `name` in `index`, creating the Set on
-- first use.
local function insert(index, name, node)
  local bucket = index[name]
  if not bucket then
    bucket = Set:new()
    index[name] = bucket
  end
  bucket:add(node)
end
-- Mark this element as closed. When both positions are given they replace the
-- stored closing positions. The element is then registered with every ancestor
-- under its name, each attribute name, its id and each of its classes.
function ElementNode:close(closestart, closeend)
  if closestart and closeend then
    self._closestart, self._closeend = closestart, closeend
  end
  -- inform higher level nodes about this element's existence in their branches
  local ancestor = self.parent
  while ancestor do
    ancestor.deepernodes:add(self)
    insert(ancestor.deeperelements, self.name, self)
    for attribute in pairs(self.attributes) do
      insert(ancestor.deeperattributes, attribute, self)
    end
    if self.id then
      insert(ancestor.deeperids, self.id, self)
    end
    for _, class in ipairs(self.classes) do
      insert(ancestor.deeperclasses, class, self)
    end
    ancestor = ancestor.parent
  end
end
-- Escape the Lua pattern magic characters in `s` so it can be matched literally.
-- Returns exactly one value, the escaped string. string.gsub also yields the
-- number of replacements; letting that second value through made
-- `string.match(a, escape(v))` receive it as the start position.
local function escape(s)
  -- escape all ^, $, (, ), %, ., [, ], *, +, - , and ? with a % prefix
  local escaped = string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1")
  return escaped
end
-- Evaluate selector string `s` against the nodes below `self`.
-- The selector is processed one whitespace-separated part at a time: ">" limits
-- the next part to direct children, "*" keeps every candidate, and any other
-- part is broken into element / #id / .class / [attribute] tests, optionally
-- inside a ":not(...)" filter.
-- Returns an array of matching nodes sorted by their `index` field. When `s` is
-- nil, not a string, or empty, an empty Set is returned instead.
local function select(self, s)
if not s or type(s) ~= "string" or s == "" then return Set:new() end
-- lookup tables keyed by selector type prefix
local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes,
["#"] = self.deeperids, ["."] = self.deeperclasses}
-- match(t, w): returns the Set of nodes registered under w in the table for
-- type t; for attribute selectors with a value, nodes failing the comparison are removed.
local function match(t, w)
local m, e, v
if t == "[" then w, m, e, v = string.match(w,
"([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
"([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
"(=?)" .. -- e = the optional "="
"(.*)" -- v = anything following the "=", or else ""
)
end
local matched = Set:new(sets[t][w])
-- attribute value selectors
if e == "=" then
if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted
v = string.sub(v, 2, #v - 1) -- strip quotes
if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute
for node in pairs(matched) do
local a = node.attributes[w]
-- equals
if m == "" and a ~= v then matched:remove(node)
-- not equals
elseif m == "!" and a == v then matched:remove(node)
-- prefix
elseif m =="|" and string.match(a, "^[^-]*") ~= v then matched:remove(node)
-- contains
elseif m =="*" and string.match(a, escape(v)) ~= v then matched:remove(node)
-- word
elseif m =="~" then matched:remove(node)
for word in string.gmatch(a, "%S+") do
if word == v then matched:add(node) break end
end
-- starts with
elseif m =="^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node)
-- ends with
elseif m =="$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node)
end
end -- for node
end -- if v
return matched
end
-- subjects: nodes the next part is evaluated against; starts as just `self`
local subjects, resultset, childrenonly = Set:new({self})
for part in string.gmatch(s, "%S+") do
-- the repeat ... until true wrapper lets `break` act as "continue with the next part"
repeat
if part == ">" then childrenonly = true --[[goto nextpart]] break end
resultset = Set:new()
-- candidates: all descendants of each subject, or only direct children after ">"
for subject in pairs(subjects) do
local star = subject.deepernodes
if childrenonly then star = Set:new(subject.nodes) end
resultset = resultset + star
end
childrenonly = false
if part == "*" then --[[goto nextpart]] break end
local excludes, filter = Set:new()
local start, pos = 0, 0
while true do
local switch, stype, name, eq, quote
start, pos, switch, stype, name, eq, quote = string.find(part,
"(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off
"([:%[#.]?)" .. -- stype = a possible :, [, #, or .
"([%w-_\\]+)" .. -- name = 1 or more alfanumeric chars (+ hyphen, reverse slash and uderscore)
"([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or =
"(['\"]?)", -- quote = a ' or " delimiting a possible attribute value
pos + 1
)
if not name then break end
repeat
if ":" == stype then
filter = name
--[[goto nextname]] break
end
if ")" == switch then
filter = nil
end
-- a quoted attribute value: re-read it including its quotes and append it to the name
if "[" == stype and "" ~= quote then
local value
start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos)
name = name .. eq .. value
end
local matched = match(stype, name)
-- inside ":not(...)" matches are collected for removal, otherwise they narrow the result
if filter == "not" then
excludes = excludes + matched
else
resultset = resultset * matched
end
--::nextname::
break
until true
end
resultset = resultset - excludes
subjects = Set:new(resultset)
--::nextpart::
break
until true
end
-- NOTE(review): resultset is only assigned inside the loop above, so a selector
-- made only of whitespace would reach this line with resultset == nil — confirm.
resultset = resultset:tolist()
table.sort(resultset, function (a, b) return a.index < b.index end)
return resultset
end
function ElementNode:select(s) return select(self, s) end
ElementNode.mt.__call = select
return ElementNode

View File

@ -0,0 +1,19 @@
-- vim: ft=lua ts=2
-- Lookup table of lower-case element names, each mapped to true.
-- NOTE(review): presumably this is the module loaded as "htmlparser.voidelements",
-- whose entries the parser closes immediately instead of descending into — confirm.
return {
area = true,
base = true,
br = true,
col = true,
command = true,
embed = true,
hr = true,
img = true,
input = true,
keygen = true,
link = true,
meta = true,
param = true,
source = true,
track = true,
wbr = true
}

View File

@ -18,12 +18,14 @@ import hashlib
import shutil
import socket
import sys
import json
import time
project = Project(
title = "Geometry Dash",
title = "No",
project_html = """
<h2>Geometry Dash</h2>
<p>Time to archive Geometry Dash?</p>
<h2>Funeral homes</h2>
<p>Archiving funeral homes, because who else will?</p>
""",
)
@ -34,7 +36,7 @@ project = Project(
# It will be added to the WARC files and reported to the tracker.
VERSION = '20220428.01'
#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
TRACKER_ID = 'geometrytrash'
TRACKER_ID = 'funeralhomestest'
TRACKER_HOST = '172.17.0.1:8501'
WGET_AT = find_executable(
@ -118,13 +120,11 @@ def get_hash(filename):
CWD = os.getcwd()
PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py'))
LUA_SHA1 = get_hash(os.path.join(CWD, 'grab.lua'))
GMD_LUA_SHA1 = get_hash(os.path.join(CWD, 'gmd.lua'))
def stats_id_function(item):
d = {
'pipeline_hash': PIPELINE_SHA1,
'lua_hash': LUA_SHA1,
'gmd_lua_hash': GMD_LUA_SHA1,
'python_version': sys.version,
}
@ -135,18 +135,10 @@ class MoveFiles(SimpleTask):
SimpleTask.__init__(self, 'MoveFiles')
def process(self, item):
os.rename('%(item_dir)s/%(warc_file_base)s.warc.gz' % item,
'%(data_dir)s/%(warc_file_base)s.warc.gz' % item)
shutil.rmtree('%(item_dir)s' % item)
class AwfulBackfeed(SimpleTask):
def __init__(self):
SimpleTask.__init__(self, 'AwfulBackfeed')
def process(self, item):
with open('%(item_dir)s/new_items' % item) as file:
new_items = file.read()
item["ts"] = time.time()
item["dd"] = item["data_dir"].lstrip("grab/data/")
shutil.move('%(item_dir)s/' % item,
'/finished/%(dd)s_%(item_name)s_%(ts)s/' % item)
class WgetArgs(object):
def realize(self, item):
@ -162,18 +154,22 @@ class WgetArgs(object):
'--truncate-output',
'-e', 'robots=off',
'--rotate-dns',
'--page-requisites',
'--timeout', '10',
'--tries', '10',
'--span-hosts',
'--waitretry', '5000',
'--waitretry', '0',
'-w', '1',
'--random-wait',
'--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
'--warc-header', 'operator: TheTechRobo <thetechrobo@protonmail.ch>',
'--warc-header', json.dumps(stats_id_function(item)),
'--warc-header', 'x-wget-at-project-version: ' + VERSION,
'--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
'--warc-dedup-url-agnostic',
'--header', 'Contact: Discord TheTechRobo#7420',
'--header', 'Connection: keep-alive',
'--header', 'Accept-Language: en-US;q=0.9, en;q=0.8'
'-U', 'Mozilla/5.0 (Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0 ; Operator: TheTechRobo thetechrobo@protonmail.ch',
]
item['item_name_newline'] = item['item_name'].replace('\0', '\n')
@ -183,8 +179,13 @@ class WgetArgs(object):
for item_name in item['item_name'].split('\0'):
wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
wget_args.append('item-name://'+item_name)
item_urls.append(item_name)
wget_args.append(item_name)
i_n = item_name.split(':')
if i_n[0] == 'downsandson':
item_name = [f'https://downsandsonfuneralhome.com/tribute/details/{i_n[1]}/Dr-Alex-Klym/obituary.html']
if i_n[0] == 'tharpsontheimerfh':
item_name = [f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}', f'https://www.tharpsontheimerfh.com/printnotice/{i_n[1]}/1o/1c/1q/0d/1b']
item_urls+=(item_name)
wget_args+=(item_name)
item['item_urls'] = item_urls
item['custom_items'] = json.dumps(custom_items)
@ -203,7 +204,7 @@ pipeline = Pipeline(
GetItemFromTracker('http://{}/{}'
.format(TRACKER_HOST, TRACKER_ID),
downloader, VERSION),
PrepareDirectories(warc_prefix='gmd'),
PrepareDirectories(warc_prefix='funeralhome'),
WgetDownload(
WgetArgs(),
max_tries=1,

106
table_show.lua Normal file
View File

@ -0,0 +1,106 @@
--[[
Author: Julio Manuel Fernandez-Diaz
Date: January 12, 2007
(For Lua 5.1)
Modified slightly by RiciLake to avoid the unnecessary table traversal in tablecount()
Formats tables with cycles recursively to any depth.
The output is returned as a string.
References to other tables are shown as values.
Self references are indicated.
The returned string is "Lua code" that can be processed
(provided that indent consists only of spaces or "--").
Userdata and function keys and values are shown as strings,
which are, logically, not exactly equivalent to the original values.
This routine can also be used to pretty-format tables with
proper indentation, not just to print them:
print(table.show(t, "t")) -- a typical use
Heavily based on "Saving tables with cycles", PIL2, p. 113.
Arguments:
t is the table.
name is the name of the table (optional)
indent is a first indentation (optional).
--]]
--- Render a value as a string of Lua-like source text.
-- Tables are walked recursively. A table that was already visited is printed
-- as `{}` with a "(self reference)" remark, and an assignment restoring the
-- reference is appended after the main text.
-- @param t      value to render
-- @param name   label used on the left-hand side (defaults to "__unnamed__")
-- @param indent prefix put in front of the lines of a rendered table (optional)
-- @return the rendered string
function table.show(t, name, indent)
  local out      -- text produced so far
  local backrefs -- trailing assignments that restore self references

  -- Turn a non-table value into literal text: numbers and booleans verbatim,
  -- functions as a quoted description of where they were defined, anything
  -- else as a quoted tostring().
  local function literal(o)
    local text = tostring(o)
    local kind = type(o)
    if kind == "number" or kind == "boolean" then
      return text
    end
    if kind ~= "function" then
      return string.format("%q", text)
    end
    local info = debug.getinfo(o, "S")
    if info.what == "C" then
      return string.format("%q", text .. ", C function")
    end
    return string.format("%q", text .. ", defined in (" ..
      info.linedefined .. "-" .. info.lastlinedefined ..
      ")" .. info.source)
  end

  -- Append `label = <value>` to `out`; `path` is the full access expression of
  -- the value and `seen` maps visited tables to the path they were first
  -- reached by.
  local function emit(value, path, pad, seen, label)
    pad = pad or ""
    seen = seen or {}
    label = label or path
    out = out .. pad .. label
    if type(value) ~= "table" then
      out = out .. " = " .. literal(value) .. ";\n"
      return
    end
    if seen[value] then
      out = out .. " = {}; -- " .. seen[value]
        .. " (self reference)\n"
      backrefs = backrefs .. path .. " = " .. seen[value] .. ";\n"
      return
    end
    seen[value] = path
    if next(value) == nil then
      out = out .. " = {};\n"
      return
    end
    out = out .. " = {\n"
    for k, v in pairs(value) do
      local key = literal(k)
      local child_path = string.format("%s[%s]", path, key)
      local child_label = string.format("[%s]", key)
      -- nested entries are indented one level deeper than their parent
      emit(v, child_path, pad .. " ", seen, child_label)
    end
    out = out .. pad .. "};\n"
  end

  name = name or "__unnamed__"
  if type(t) ~= "table" then
    return name .. " = " .. literal(t)
  end
  out, backrefs = "", ""
  emit(t, name, indent)
  return out .. backrefs
end