diff --git a/Makefile b/Makefile index 800baae..fb15271 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ run: make clean docker build -t img . - docker run --rm img test + docker run -v "/media/thetechrobo/2tb/obitdata:/finished" --rm img TheTechRobo --concurrent 1 clean: rm -rf img diff --git a/gmd.lua b/gmd.lua deleted file mode 100644 index 45d909a..0000000 --- a/gmd.lua +++ /dev/null @@ -1,139 +0,0 @@ --- https://stackoverflow.com/questions/40149617/split-string-with-specified-delimiter-in-lua -function split(s, sep) - local fields = {} - - local sep = sep or " " - local pattern = string.format("([^%s]+)", sep) - string.gsub(s, pattern, function(c) fields[#fields + 1] = c end) - - return fields -end --- https://stackoverflow.com/questions/40149617/split-string-with-specified-delimiter-in-lua --- -GMD = {} -GMD["comments"] = {} - -function table.show(t, name, indent) - local cart -- a container - local autoref -- for self references - - --[[ counts the number of elements in a table - local function tablecount(t) - local n = 0 - for _, _ in pairs(t) do n = n+1 end - return n - end - ]] - -- (RiciLake) returns true if the table is empty - local function isemptytable(t) return next(t) == nil end - - local function basicSerialize (o) - local so = tostring(o) - if type(o) == "function" then - local info = debug.getinfo(o, "S") - -- info.name is nil because o is not a calling level - if info.what == "C" then - return string.format("%q", so .. ", C function") - else - -- the information is defined through lines - return string.format("%q", so .. ", defined in (" .. - info.linedefined .. "-" .. info.lastlinedefined .. - ")" .. info.source) - end - elseif type(o) == "number" or type(o) == "boolean" then - return so - else - return string.format("%q", so) - end - end - - local function addtocart (value, name, indent, saved, field) - indent = indent or "" - saved = saved or {} - field = field or name - - cart = cart .. indent .. field - - if type(value) ~= "table" then - cart = cart .. " = " .. basicSerialize(value) .. ";\n" - else - if saved[value] then - cart = cart .. " = {}; -- " .. saved[value] - .. " (self reference)\n" - autoref = autoref .. name .. " = " .. saved[value] .. ";\n" - else - saved[value] = name - --if tablecount(value) == 0 then - if isemptytable(value) then - cart = cart .. " = {};\n" - else - cart = cart .. " = {\n" - for k, v in pairs(value) do - k = basicSerialize(k) - local fname = string.format("%s[%s]", name, k) - field = string.format("[%s]", k) - -- three spaces between levels - addtocart(v, fname, indent .. " ", saved, field) - end - cart = cart .. indent .. "};\n" - end - end - end - end - - name = name or "__unnamed__" - if type(t) ~= "table" then - return name .. " = " .. basicSerialize(t) - end - cart, autoref = "", "" - addtocart(t, name, indent) - return cart .. autoref -end - -GMD.comments.mapping = {"levelID","comment","authorPlayerID","likes","dislikes","messageID","spam","authorAccountID","age","percent","modBadge","moderatorChatColor"} -- https://docs.gdprogra.me/#/resources/server/comment - -GMD["comments"]["parse"] = function(comments) - local comment = comments - local splitted = split(comment, ":") - if not splitted[2] then - return false - end - local retern = {} - retern.comment = splitted[1] - retern.account = splitted[2] - - retern.parsed = {} - retern.parsed.comment = {} - local data = split(retern.comment, "|") - for i=1, #data do - retern.parsed.comment[i] = {} - -- comment parser - local ndata = split(data[i], "~") - for j=1, #ndata do - if not (j % 2 == 0) then -- key - key = ndata[j] - key = GMD.comments.mapping[tonumber(key)] - else -- value - local value = ndata[j] - retern.parsed.comment[i][key] = value - end - end - end - return retern -end -GMD["comments"]["getOneComment"] = function (self, comments, pos) - local parsed = self.parse(comments) - if not parsed then - return false - else - return parsed.parsed.comment[pos or 1] - end -end - - -function GMDtest() - local strin = "2~NzUwMCBzdGFycyBjOg==~4~3~9~1 month~6~1803945|2~SSBiZWF0IDYgaW5zYW5lIGRlbW9ucyBpbiAyNCBob3VycyBsbWFvOiBOZWNyb3BvbGlzLCBUaGUgQ2F2ZXJucyBJSSwgRWxlbWVudHMgWCwgWCBBZHZlbnR1cmUsIFNhZGlzbSwgYW5kIEJsYXN0ZXIgYzo=~4~21~9~8 months~6~1793260|2~L1wvXC9cIDwz~4~6~9~1 year~6~1785414|2~U2VudCBmcm9tIGlPUyBTaG9ydGN1dHMh~4~8~9~1 year~6~1776426|2~VGhpcyBjb21tZW50IHdhcyB1cGxvYWRlZCBmb3IgdGhlIEdEIERvY3Mh~4~5~9~1 year~6~1772719|2~VGhlIHRyaWxvZ3kgaGFzIGJlZW4gY29tcGxldGVkLi4uR0cgQWZ0ZXJtYXRoIQ==~4~8~9~1 year~6~1766450|2~Im93byIgLSBGb3VuZG15YmFsbA==~4~4~9~1 year~6~1766338|2~NTAwMCBzdGFycyE=~4~12~9~2 years~6~1756926|2~Qmxvb2RiYXRoIEdHISEh~4~24~9~2 years~6~1745624|2~QWxsZWdpYW5jZSAxMDAl~4~3~9~2 years~6~1744292#73:0:10" - assert(GMD["comments"]:getOneComment(strin)["comment"] == "NzUwMCBzdGFycyBjOg==") - assert(not GMD["comments"]:getOneComment("-1")) -end -GMDtest() diff --git a/grab.lua b/grab.lua index 2ca7223..e4d5b33 100644 --- a/grab.lua +++ b/grab.lua @@ -1,6 +1,5 @@ -require "gmd" - -NEW_ITEMS = {} +local htmlparser = require "htmlparser" +require "table_show" function readAll(file) local f = assert(io.open(file, "rb")) @@ -9,19 +8,54 @@ function readAll(file) return content end --- print(table.show(false)) -wget.callbacks.httploop_result = function(url, err, http_stat) - local data = readAll(http_stat.local_file) - -- Time to make sure that it's a valid response. - local result = GMD.comments.parse(data) - if result then - return wget.actions.NOTHING - else - io.stderr:write("\aYou've been IP-banned from Geometry Dash's servers. Sorry about that.\n") - io.stderr:write("Please let us know in #geometrytrash on hackint!\n") - io.stderr:write("Sleeping 69 seconds. (nice)\n") - os.execute("sleep 69") - return wget.actions.ABORT -- We've been banned - end +QUEUED_URLS = false + +function startswith(text, prefix) + return text:find(prefix, 1, true) == 1 end +wget.callbacks.httploop_result = function(url, err, http_stat) + io.stderr:write(http_stat["statcode"] .. url["url"] .. "\n") +end + +wget.callbacks.get_urls = function(file, url, is_css, iri) + local addedUrls = {} + local data = readAll(file) + io.stderr:write("Read data\n") + if url:match("https://downsandsonfuneralhome%.com/tribute/details/[^/]+/Dr%-Alex%-Klym/obituary%.html") then + local root = htmlparser.parse(data) + io.stderr:write("Read root\n") + local dataa ={} + dataa.obit = root("#obituary-link-list-item a") + dataa.cond = root("#condolences-link-list-item a") + dataa.serv = root("#service-link-list-item a") + dataa.mems = root("#memories-link-list-item a") + dataa.char = root("#charities-link-list-item a") + dataa.prin = root(".print-obit-btn a") + assert(not dataa.cond[2]) + assert(not dataa.serv[2]) + assert(not dataa.mems[2]) + assert(not dataa.char[2]) + assert(not dataa.prin[2]) + assert(not dataa.obit[2]) -- make sure that there's only one element that fits the criteria + table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.obit[1].attributes.href}) + table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.cond[1].attributes.href}) + table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.serv[1].attributes.href}) + table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.char[1].attributes.href}) + if dataa.prin[1] then + table.insert(addedUrls, { url="https://downsandsonfuneralhome.com" .. dataa.prin[1].attributes.href}) + end + QUEUED_URLS = true + end + if startswith(url, "https://www.tharpsontheimerfh.com/tributes/") then + local ok=os.getenv("item_name") + local root=htmlparser.parse(data) + local div =root("#obitsbarV31") + assert(not div[2]) + local oid=div[1].attributes["data-oid"] + table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/obpgsnvn", post_data="sn=tributewall&oid=" .. oid}) + table.insert(addedUrls, { url="https://www.tharpsontheimerfh.com/pax/prnobit", post_data="ok=" .. ok .. "&fcf=0&bg=1"}) + end + io.stderr:write(table.show(addedUrls, "Added URLs ")) + return addedUrls +end diff --git a/htmlparser.lua b/htmlparser.lua new file mode 100644 index 0000000..c6226be --- /dev/null +++ b/htmlparser.lua @@ -0,0 +1,256 @@ +-- vim: ft=lua ts=2 sw=2 + +-- Syntactic Sugar {{{ +local function rine(val) -- Return (val) If it's Not Empty (non-zero-length) + return (val and #val>0) and val +end +local function rit(a) -- Return (a) If it's Table + return (type(a) == "table") and a +end +local noop = function() end +local esc = function(s) return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") end +local str = tostring +local char = string.char +local opts = rit(htmlparser_opts) or {} -- needed for silent/noerr/noout/nonl directives, also needed to be defined before `require` in such case +local prn = opts.silent and noop or function(l,f,...) + local fd = (l=="i") and "stdout" or "stderr" + local t = (" [%s] "):format(l:upper()) + io[fd] + :write('[HTMLParser]'..t..f:format(...) + ..(opts.nonl or "\n") + ) +end +local err = opts.noerr and noop or function(f,...) prn("e",f,...) end +local out = opts.noout and noop or function(f,...) prn("i",f,...) end +local line = debug and function(lvl) return debug.getinfo(lvl or 2).currentline end or noop +local dbg = opts.debug and function(f,...) prn("d",f:gsub("#LINE#",str(line(3))),...) end or noop +-- }}} +-- Requires {{{ +local ElementNode = require"htmlparser.ElementNode" +local voidelements = require"htmlparser.voidelements" +--}}} +local HtmlParser = {} +local function parse(text,limit) -- {{{ + local opts = rine(opts) -- use top-level opts-table (the one, defined before requiring the module), if exists + or rit(htmlparser_opts) -- or defined after requiring (but before calling `parse`) + or {} -- fallback otherwise + opts.looplimit = opts.looplimit or htmlparser_looplimit + + local text = str(text) + local limit = limit or opts.looplimit or 1000 + local tpl = false + + if not opts.keep_comments then -- Strip (or not) comments {{{ + text = text:gsub("","") -- Many chances commented code will have syntax errors, that'll lead to parser failures + end -- }}} + + local tpr={} + + if not opts.keep_danger_placeholders then -- {{{ little speedup by cost of potential parsing breakages + -- search unused "invalid" bytes {{{ + local busy,i={},0; + repeat -- {{{ + local cc = char(i) + if not(text:match(cc)) then -- {{{ + if not(tpr["<"]) or not(tpr[">"]) then -- {{{ + if not(busy[i]) then -- {{{ + if not(tpr["<"]) then -- {{{ + tpr["<"] = cc; + elseif not(tpr[">"]) then + tpr[">"] = cc; + end -- }}} + busy[i] = true + dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",str(c),cc:byte(),str(tpr[c])) + dbg("busy[i]:{%s},i:{%d}",str(busy[i]),i) + dbg("[FindPH]:#LINE# Success! || i=%d",i) + else -- if !busy + dbg("[FindPH]:#LINE# Busy! || i=%d",i) + end -- if !busy -- }}} + dbg("c:{%s}||cc:{%d}||tpr[c]:{%s}",c,cc:byte(),str(tpr[c])) + dbg("%s",str(busy[i])) + else -- if < or > + dbg("[FindPH]:#LINE# Done!",i) + break + end -- if < or > -- }}} + else -- text!match(cc) + dbg("[FindPH]:#LINE# Text contains this byte! || i=%d",i) + end -- text!match(cc) -- }}} + local skip=1 + if i==31 then + skip=96 -- ASCII + end + i=i+skip + until (i==255) -- }}} + i=nil + --- }}} + + if not(tpr["<"]) or not(tpr[">"]) then + err("Impossible to find at least two unused byte codes in this HTML-code. We need it to escape bracket-contained placeholders inside tags.") + err("Consider enabling 'keep_danger_placeholders' option (to silence this error, if parser wasn't failed with current HTML-code) or manually replace few random bytes, to free up the codes.") + else + dbg("[FindPH]:#LINE# Found! || '<'=%d, '>'=%d",tpr["<"]:byte(),tpr[">"]:byte()) + end + +-- dbg("tpr[>] || tpr[] || #busy%d") + + -- g {{{ + local function g(id,...) + local arg={...} + local orig=arg[id] + arg[id]=arg[id]:gsub("(.)",tpr) + if arg[id] ~= orig then + tpl=true + dbg("[g]:#LINE# orig: %s", str(orig)) + dbg("[g]:#LINE# replaced: %s",str(arg[id])) + end + dbg("[g]:#LINE# called, id: %s, arg[id]: %s, args { "..(("{%s}, "):rep(#arg):gsub(", $","")).." }",id,arg[id],...) + dbg("[g]:#LINE# concat(arg): %s",table.concat(arg)) + return table.concat(arg) + end + -- g }}} + + -- tpl-placeholders and attributes {{{ + text=text + :gsub( + "(=[%s]-)".. -- only match attr.values, and not random strings between two random apostrophs + "(%b'')", + function(...)return g(2,...)end + ) + :gsub( + "(=[%s]-)".. -- same for " + '(%b"")', + function(...)return g(2,...)end + ) -- Escape "<"/">" inside attr.values (see issue #50) + :gsub( + "(<".. -- Match "<", + (opts.tpl_skip_pattern or "[^!]").. -- with exclusion pattern (for example, to ignore comments, which aren't template placeholders, but can legally contain "<"/">" inside. + ")([^>]+)".. -- If matched, we want to escape '<'s if we meet them inside tag + "(>)", + function(...)return g(2,...)end + ) + :gsub( + "(".. + (tpr["<"] or "__FAILED__").. -- Here we search for "<", we escaped in previous gsub (and don't break things if we have no escaping replacement) + ")(".. + (opts.tpl_marker_pattern or "[^%w%s]").. -- Capture templating symbol + ")([%g%s]-)".. -- match placeholder's content + "(%2)(>)".. -- placeholder's tail + "([^>]*>)", -- remainings + function(...)return g(5,...)end + ) + -- }}} + end -- }}} + + local index = 0 + local root = ElementNode:new(index, str(text)) + local node, descend, tpos, opentags = root, true, 1, {} + + while true do -- MainLoop {{{ + if index == limit then -- {{{ + err("Main loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit) + break + end -- }}} + -- openstart/tpos Definitions {{{ + local openstart, name + openstart, tpos, name = root._text:find( + "<" .. -- an uncaptured starting "<" + "([%w-]+)" .. -- name = the first word, directly following the "<" + "[^>]*>", -- include, but not capture everything up to the next ">" + tpos) + dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name)) + -- }}} + if not name then break end + -- Some more vars {{{ + index = index + 1 + local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos) + node = tag + local tagloop + local tagst, apos = tag:gettext(), 1 + -- }}} + while true do -- TagLoop {{{ + dbg("[TagLoop]:#LINE# tag.name=%s, tagloop=%s",str(tag.name),str(tagloop)) + if tagloop == limit then -- {{{ + err("Tag parsing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit) + break + end -- }}} + -- Attrs {{{ + local start, k, eq, quote, v, zsp + start, apos, k, zsp, eq, zsp, quote = tagst:find( + "%s+" .. -- some uncaptured space + "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">" + "([%s]-)".. -- zero or more spaces + "(=?)" .. -- eq = the optional; "=", else "" + "([%s]-)".. -- zero or more spaces + [=[(['"]?)]=], -- quote = an optional "'" or '"' following the "=", or "" + apos) + dbg("[TagLoop]:#LINE# start=%s || apos=%s || k=%s || zsp='%s' || eq='%s', quote=[%s]",str(start),str(apos),str(k),str(zsp),str(eq),str(quote)) + -- }}} + if not k or k == "/>" or k == ">" then break end + -- Pattern {{{ + if eq == "=" then + local pattern = "=([^%s>]*)" + if quote ~= "" then + pattern = quote .. "([^" .. quote .. "]*)" .. quote + end + start, apos, v = tagst:find(pattern, apos) + dbg("[TagLoop]:#LINE# start=%s || apos=%s || v=%s || pattern=%s",str(start),str(apos),str(v),str(pattern)) + end + -- }}} + v=v or "" + if tpl then -- {{{ + for rk,rv in pairs(tpr) do + v = v:gsub(rv,rk) + dbg("[TagLoop]:#LINE# rv=%s || rk=%s",str(rv),str(rk)) + end + end -- }}} + + dbg("[TagLoop]:#LINE# k=%s || v=%s",str(k),str(v)) + tag:addattribute(k, v) + tagloop = (tagloop or 0) + 1 + end + -- }}} + if voidelements[tag.name:lower()] then -- {{{ + descend = false + tag:close() + else + descend = true + opentags[tag.name] = opentags[tag.name] or {} + table.insert(opentags[tag.name], tag) + end + -- }}} + local closeend = tpos + local closingloop + while true do -- TagCloseLoop {{{ + -- Can't remember why did I add that, so comment it for now (and not remove), in case it will be needed again + -- (although, it causes #59 and #60, so it will anyway be needed to rework) + -- if voidelements[tag.name:lower()] then break end -- already closed + if closingloop == limit then + err("Tag closing loop reached loop limit (%d). Consider either increasing it or checking HTML-code for syntax errors", limit) + break + end + + local closestart, closing, closename + closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend) + dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename)) + + if not closing or closing == "" then break end + + tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags + closestart = root._text:find("<", closestart) + dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart)) + tag:close(closestart, closeend + 1) + node = tag.parent + descend = true + closingloop = (closingloop or 0) + 1 + end -- }}} + end -- }}} + if tpl then -- {{{ + dbg("tpl") + for k,v in pairs(tpr) do + root._text = root._text:gsub(v,k) + end + end -- }}} + return root +end -- }}} +HtmlParser.parse = parse +return HtmlParser diff --git a/htmlparser/ElementNode.lua b/htmlparser/ElementNode.lua new file mode 100644 index 0000000..0c39901 --- /dev/null +++ b/htmlparser/ElementNode.lua @@ -0,0 +1,283 @@ +-- vim: ft=lua ts=2 +local Set = {} +Set.mt = {__index = Set} +function Set:new(values) + local instance = {} + local isSet if getmetatable(values) == Set.mt then isSet = true end + if type(values) == "table" then + if not isSet and #values > 0 then + for _,v in ipairs(values) do + instance[v] = true + end + else + for k in pairs(values) do + instance[k] = true + end + end + elseif values ~= nil then + instance = {[values] = true} + end + return setmetatable(instance, Set.mt) +end + +function Set:add(e) + if e ~= nil then self[e] = true end + return self +end + +function Set:remove(e) + if e ~= nil then self[e] = nil end + return self +end + +function Set:tolist() + local res = {} + for k in pairs(self) do + table.insert(res, k) + end + return res +end + +Set.mt.__add = function (a, b) + local res, a, b = Set:new(), Set:new(a), Set:new(b) + for k in pairs(a) do res[k] = true end + for k in pairs(b) do res[k] = true end + return res +end + +-- Subtraction +Set.mt.__sub = function (a, b) + local res, a, b = Set:new(), Set:new(a), Set:new(b) + for k in pairs(a) do res[k] = true end + for k in pairs(b) do res[k] = nil end + return res +end + +-- Intersection +Set.mt.__mul = function (a, b) + local res, a, b = Set:new(), Set:new(a), Set:new(b) + for k in pairs(a) do + res[k] = b[k] + end + return res +end + +-- String representation +Set.mt.__tostring = function (set) + local s = "{" + local sep = "" + for k in pairs(set) do + s = s .. sep .. tostring(k) + sep = ", " + end + return s .. "}" +end + + +local ElementNode = {} +ElementNode.mt = {__index = ElementNode} +function ElementNode:new(index, nameortext, node, descend, openstart, openend) + local instance = { + index = index, + name = nameortext, + level = 0, + parent = nil, + root = nil, + nodes = {}, + _openstart = openstart, _openend = openend, + _closestart = openstart, _closeend = openend, + attributes = {}, + id = nil, + classes = {}, + deepernodes = Set:new(), + deeperelements = {}, deeperattributes = {}, deeperids = {}, deeperclasses = {} + } + if not node then + instance.name = "root" + instance.root = instance + instance._text = nameortext + local length = string.len(nameortext) + instance._openstart, instance._openend = 1, length + instance._closestart, instance._closeend = 1, length + elseif descend then + instance.root = node.root + instance.parent = node + instance.level = node.level + 1 + table.insert(node.nodes, instance) + else + instance.root = node.root + instance.parent = node.parent or node --XXX: adds some safety but needs more testing for heisenbugs in corner cases + instance.level = node.level + table.insert((node.parent and node.parent.nodes or node.nodes), instance) --XXX: see above about heisenbugs + end + return setmetatable(instance, ElementNode.mt) +end + +function ElementNode:gettext() + return string.sub(self.root._text, self._openstart, self._closeend) +end + +function ElementNode:settext(c) + self.root._text=c +end + +function ElementNode:textonly() + return (self:gettext():gsub("<[^>]*>","")) +end + +function ElementNode:getcontent() + return string.sub(self.root._text, self._openend + 1, self._closestart - 1) +end + +function ElementNode:addattribute(k, v) + self.attributes[k] = v + if string.lower(k) == "id" then + self.id = v + -- class attribute contains "space-separated tokens", each of which we'd like quick access to + elseif string.lower(k) == "class" then + for class in string.gmatch(v, "%S+") do + table.insert(self.classes, class) + end + end +end + +local function insert(table, name, node) + table[name] = table[name] or Set:new() + table[name]:add(node) +end + +function ElementNode:close(closestart, closeend) + if closestart and closeend then + self._closestart, self._closeend = closestart, closeend + end + -- inform hihger level nodes about this element's existence in their branches + local node = self + while true do + node = node.parent + if not node then break end + node.deepernodes:add(self) + insert(node.deeperelements, self.name, self) + for k in pairs(self.attributes) do + insert(node.deeperattributes, k, self) + end + if self.id then + insert(node.deeperids, self.id, self) + end + for _,v in ipairs(self.classes) do + insert(node.deeperclasses, v, self) + end + end +end + +local function escape(s) + -- escape all ^, $, (, ), %, ., [, ], *, +, - , and ? with a % prefix + return string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%" .. "%1") +end + +local function select(self, s) + if not s or type(s) ~= "string" or s == "" then return Set:new() end + local sets = {[""] = self.deeperelements, ["["] = self.deeperattributes, + ["#"] = self.deeperids, ["."] = self.deeperclasses} + local function match(t, w) + local m, e, v + if t == "[" then w, m, e, v = string.match(w, + "([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^" + "([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "=" + "(=?)" .. -- e = the optional "=" + "(.*)" -- v = anything following the "=", or else "" + ) + end + local matched = Set:new(sets[t][w]) + -- attribute value selectors + if e == "=" then + if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted + v = string.sub(v, 2, #v - 1) -- strip quotes + if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute + for node in pairs(matched) do + local a = node.attributes[w] + -- equals + if m == "" and a ~= v then matched:remove(node) + -- not equals + elseif m == "!" and a == v then matched:remove(node) + -- prefix + elseif m =="|" and string.match(a, "^[^-]*") ~= v then matched:remove(node) + -- contains + elseif m =="*" and string.match(a, escape(v)) ~= v then matched:remove(node) + -- word + elseif m =="~" then matched:remove(node) + for word in string.gmatch(a, "%S+") do + if word == v then matched:add(node) break end + end + -- starts with + elseif m =="^" and string.match(a, "^" .. escape(v)) ~= v then matched:remove(node) + -- ends with + elseif m =="$" and string.match(a, escape(v) .. "$") ~= v then matched:remove(node) + end + end -- for node + end -- if v + return matched + end + + local subjects, resultset, childrenonly = Set:new({self}) + for part in string.gmatch(s, "%S+") do + repeat + if part == ">" then childrenonly = true --[[goto nextpart]] break end + resultset = Set:new() + for subject in pairs(subjects) do + local star = subject.deepernodes + if childrenonly then star = Set:new(subject.nodes) end + resultset = resultset + star + end + childrenonly = false + if part == "*" then --[[goto nextpart]] break end + local excludes, filter = Set:new() + local start, pos = 0, 0 + while true do + local switch, stype, name, eq, quote + start, pos, switch, stype, name, eq, quote = string.find(part, + "(%(?%)?)" .. -- switch = a possible ( or ) switching the filter on or off + "([:%[#.]?)" .. -- stype = a possible :, [, #, or . + "([%w-_\\]+)" .. -- name = 1 or more alfanumeric chars (+ hyphen, reverse slash and uderscore) + "([|%*~%$!%^]?=?)" .. -- eq = a possible |=, *=, ~=, $=, !=, ^=, or = + "(['\"]?)", -- quote = a ' or " delimiting a possible attribute value + pos + 1 + ) + if not name then break end + repeat + if ":" == stype then + filter = name + --[[goto nextname]] break + end + if ")" == switch then + filter = nil + end + if "[" == stype and "" ~= quote then + local value + start, pos, value = string.find(part, "(%b" .. quote .. quote .. ")]", pos) + name = name .. eq .. value + end + local matched = match(stype, name) + if filter == "not" then + excludes = excludes + matched + else + resultset = resultset * matched + end + --::nextname:: + break + until true + end + resultset = resultset - excludes + subjects = Set:new(resultset) + --::nextpart:: +break +until true + end + resultset = resultset:tolist() + table.sort(resultset, function (a, b) return a.index < b.index end) + return resultset +end + +function ElementNode:select(s) return select(self, s) end +ElementNode.mt.__call = select + +return ElementNode diff --git a/htmlparser/voidelements.lua b/htmlparser/voidelements.lua new file mode 100644 index 0000000..43dedf5 --- /dev/null +++ b/htmlparser/voidelements.lua @@ -0,0 +1,19 @@ +-- vim: ft=lua ts=2 +return { + area = true, + base = true, + br = true, + col = true, + command = true, + embed = true, + hr = true, + img = true, + input = true, + keygen = true, + link = true, + meta = true, + param = true, + source = true, + track = true, + wbr = true +} diff --git a/pipeline.py b/pipeline.py index 4c6b0b3..adc49ca 100644 --- a/pipeline.py +++ b/pipeline.py @@ -18,12 +18,14 @@ import hashlib import shutil import socket import sys +import json +import time project = Project( - title = "Geometry Dash", + title = "No", project_html = """ -

Geometry Dash

-

Time to archive Geometry Dash?

+

Funeral homes

+

Archiving funeral homes, because who else will?

""", ) @@ -34,7 +36,7 @@ project = Project( # It will be added to the WARC files and reported to the tracker. VERSION = '20220428.01' #USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36' -TRACKER_ID = 'geometrytrash' +TRACKER_ID = 'funeralhomestest' TRACKER_HOST = '172.17.0.1:8501' WGET_AT = find_executable( @@ -118,13 +120,11 @@ def get_hash(filename): CWD = os.getcwd() PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py')) LUA_SHA1 = get_hash(os.path.join(CWD, 'grab.lua')) -GMD_LUA_SHA1 = get_hash(os.path.join(CWD, 'gmd.lua')) def stats_id_function(item): d = { 'pipeline_hash': PIPELINE_SHA1, 'lua_hash': LUA_SHA1, - 'gmd_lua_hash': GMD_LUA_SHA1, 'python_version': sys.version, } @@ -135,18 +135,10 @@ class MoveFiles(SimpleTask): SimpleTask.__init__(self, 'MoveFiles') def process(self, item): - os.rename('%(item_dir)s/%(warc_file_base)s.warc.gz' % item, - '%(data_dir)s/%(warc_file_base)s.warc.gz' % item) - - shutil.rmtree('%(item_dir)s' % item) - -class AwfulBackfeed(SimpleTask): - def __init__(self): - SimpleTask.__init__(self, 'AwfulBackfeed') - - def process(self, item): - with open('%(item_dir)s/new_items' % item) as file: - new_items = file.read() + item["ts"] = time.time() + item["dd"] = item["data_dir"].lstrip("grab/data/") + shutil.move('%(item_dir)s/' % item, + '/finished/%(dd)s_%(item_name)s_%(ts)s/' % item) class WgetArgs(object): def realize(self, item): @@ -162,18 +154,22 @@ class WgetArgs(object): '--truncate-output', '-e', 'robots=off', '--rotate-dns', + '--page-requisites', '--timeout', '10', '--tries', '10', '--span-hosts', - '--waitretry', '5000', + '--waitretry', '0', + '-w', '1', + '--random-wait', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: TheTechRobo ', + '--warc-header', json.dumps(stats_id_function(item)), '--warc-header', 'x-wget-at-project-version: ' + VERSION, '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID, '--warc-dedup-url-agnostic', '--header', 'Contact: Discord TheTechRobo#7420', '--header', 'Connection: keep-alive', - '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8' + '-U', 'Mozilla/5.0 (Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0 ; Operator: TheTechRobo thetechrobo@protonmail.ch', ] item['item_name_newline'] = item['item_name'].replace('\0', '\n') @@ -183,8 +179,13 @@ class WgetArgs(object): for item_name in item['item_name'].split('\0'): wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name]) wget_args.append('item-name://'+item_name) - item_urls.append(item_name) - wget_args.append(item_name) + i_n = item_name.split(':') + if i_n[0] == 'downsandson': + item_name = [f'https://downsandsonfuneralhome.com/tribute/details/{i_n[1]}/Dr-Alex-Klym/obituary.html'] + if i_n[0] == 'tharpsontheimerfh': + item_name = [f'https://www.tharpsontheimerfh.com/tributes/{i_n[1]}', f'https://www.tharpsontheimerfh.com/printnotice/{i_n[1]}/1o/1c/1q/0d/1b'] + item_urls+=(item_name) + wget_args+=(item_name) item['item_urls'] = item_urls item['custom_items'] = json.dumps(custom_items) @@ -203,7 +204,7 @@ pipeline = Pipeline( GetItemFromTracker('http://{}/{}' .format(TRACKER_HOST, TRACKER_ID), downloader, VERSION), - PrepareDirectories(warc_prefix='gmd'), + PrepareDirectories(warc_prefix='funeralhome'), WgetDownload( WgetArgs(), max_tries=1, diff --git a/table_show.lua b/table_show.lua new file mode 100644 index 0000000..67a951a --- /dev/null +++ b/table_show.lua @@ -0,0 +1,106 @@ +--[[ + Author: Julio Manuel Fernandez-Diaz + Date: January 12, 2007 + (For Lua 5.1) + + Modified slightly by RiciLake to avoid the unnecessary table traversal in tablecount() + + Formats tables with cycles recursively to any depth. + The output is returned as a string. + References to other tables are shown as values. + Self references are indicated. + + The string returned is "Lua code", which can be procesed + (in the case in which indent is composed by spaces or "--"). + Userdata and function keys and values are shown as strings, + which logically are exactly not equivalent to the original code. + + This routine can serve for pretty formating tables with + proper indentations, apart from printing them: + + print(table.show(t, "t")) -- a typical use + + Heavily based on "Saving tables with cycles", PIL2, p. 113. + + Arguments: + t is the table. + name is the name of the table (optional) + indent is a first indentation (optional). +--]] +function table.show(t, name, indent) + local cart -- a container + local autoref -- for self references + + --[[ counts the number of elements in a table + local function tablecount(t) + local n = 0 + for _, _ in pairs(t) do n = n+1 end + return n + end + ]] + -- (RiciLake) returns true if the table is empty + local function isemptytable(t) return next(t) == nil end + + local function basicSerialize (o) + local so = tostring(o) + if type(o) == "function" then + local info = debug.getinfo(o, "S") + -- info.name is nil because o is not a calling level + if info.what == "C" then + return string.format("%q", so .. ", C function") + else + -- the information is defined through lines + return string.format("%q", so .. ", defined in (" .. + info.linedefined .. "-" .. info.lastlinedefined .. + ")" .. info.source) + end + elseif type(o) == "number" or type(o) == "boolean" then + return so + else + return string.format("%q", so) + end + end + + local function addtocart (value, name, indent, saved, field) + indent = indent or "" + saved = saved or {} + field = field or name + + cart = cart .. indent .. field + + if type(value) ~= "table" then + cart = cart .. " = " .. basicSerialize(value) .. ";\n" + else + if saved[value] then + cart = cart .. " = {}; -- " .. saved[value] + .. " (self reference)\n" + autoref = autoref .. name .. " = " .. saved[value] .. ";\n" + else + saved[value] = name + --if tablecount(value) == 0 then + if isemptytable(value) then + cart = cart .. " = {};\n" + else + cart = cart .. " = {\n" + for k, v in pairs(value) do + k = basicSerialize(k) + local fname = string.format("%s[%s]", name, k) + field = string.format("[%s]", k) + -- three spaces between levels + addtocart(v, fname, indent .. " ", saved, field) + end + cart = cart .. indent .. "};\n" + end + end + end + end + + name = name or "__unnamed__" + if type(t) ~= "table" then + return name .. " = " .. basicSerialize(t) + end + cart, autoref = "", "" + addtocart(t, name, indent) + return cart .. autoref +end +