strawpoll-grab/grab.lua

79 lines
2.1 KiB
Lua

local htmlparser = require "htmlparser"
require "table_show"
--- Read an entire file into a string.
-- The file is opened in binary mode, so its bytes are returned untouched.
-- @param file  path of the file to read
-- @return the complete file contents
-- Raises an error (via assert) when the file cannot be opened.
function readAll(file)
  local handle = assert(io.open(file, "rb"))
  local contents = handle:read("*all")
  handle:close()
  return contents
end
-- Global flag, initialised to false. Nothing in the visible part of this file
-- reads or updates it. NOTE(review): possibly a leftover; confirm before removing.
QUEUED_URLS = false
--- Split a string into its non-empty fields.
-- `sep` is inserted unescaped into a negated character class, so every
-- character of `sep` acts as a delimiter and pattern escapes such as "%s"
-- keep their Lua-pattern meaning. Runs of delimiters yield no empty fields.
-- @param s    string to split
-- @param sep  delimiter character(s)
-- @return array of the fields, in order of appearance
function split(s, sep)
  local pieces = {}
  local field_pattern = string.format("([^%s]+)", sep)
  for field in string.gmatch(s, field_pattern) do
    pieces[#pieces + 1] = field
  end
  return pieces
end
--- Report whether `text` begins with `prefix`.
-- The search is a plain (non-pattern) find, so magic characters in `prefix`
-- are matched literally.
-- @param text    string to inspect
-- @param prefix  literal prefix to look for
-- @return true when the first occurrence of `prefix` is at position 1
function startswith(text, prefix)
  local first_hit = text:find(prefix, 1, true)
  return first_hit == 1
end
--- Report whether a sequence holds a given value.
-- Performs a linear scan over indices 1..#list using `==` comparison.
-- The first parameter is deliberately not called `table`, which would shadow
-- the standard `table` library inside this function.
-- @param list  array-like table to search
-- @param val   value to look for
-- @return true if some element equals `val`, false otherwise
local function contains(list, val)
  for i = 1, #list do
    if list[i] == val then
      return true
    end
  end
  return false
end
--- Callback registered as `wget.callbacks.download_child_p`.
-- Rejects a candidate URL that is already present in the global `ign` list
-- (printing a notice); otherwise hands back the incoming `verdict` unchanged.
-- Only `urlpos` and `verdict` are used; the other parameters are ignored.
-- @return false for an already-listed URL, else `verdict`
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
  local candidate = urlpos.url.url
  if not contains(ign, candidate) then
    return verdict
  end
  print("Skipping already-retrieved URL " .. candidate)
  return false
end
--- Callback registered as `wget.callbacks.httploop_result`.
-- Writes the status code immediately followed by the URL to stderr, then
-- appends the URL to the global `ign` list when the status code is 200.
-- URLs containing "/fetch-comments?" or ending in "/r" are never recorded.
-- `err` is unused. Returns no value.
wget.callbacks.httploop_result = function(url, err, http_stat)
  local target = url.url
  io.stderr:write(http_stat.statcode .. target .. "\n")

  -- Either match excludes the URL from the list.
  local excluded = string.match(target, "/fetch%-comments%?")
    or string.match(target, "/r$")

  if not excluded and http_stat.statcode == 200 then
    table.insert(ign, target)
  end
end
--- Callback registered as `wget.callbacks.get_urls`.
-- When `url` is a strawpoll.me fetch-comments URL that ends in `page=` plus a
-- single character, increments the global `page` counter and returns the URLs
-- for that page number: one plain and one with a fixed `&_=` query suffix.
-- Both are built from the `item_name` environment variable. For any other URL
-- the returned list is empty. The list is also dumped to stderr.
-- @param file    path of the downloaded document (not read by this callback)
-- @param url     URL that was just processed
-- @param is_css  unused
-- @param iri     unused
-- @return array of `{ url = ... }` tables
wget.callbacks.get_urls = function(file, url, is_css, iri)
  local addedUrls = {}
  -- The downloaded file is intentionally not read: nothing here uses its
  -- contents, and loading it only cost I/O on every call.

  -- NOTE(review): `.` matches exactly one character, so a URL with a
  -- multi-digit page number does not match and no further page is queued.
  -- Unclear whether that cap is intentional; confirm before widening it,
  -- since nothing else in this callback stops the paging.
  if url:match("https://www%.strawpoll%.me/[^/]+/fetch%-comments%?page=.$") then
    local item_name = os.getenv("item_name")
    page = page + 1
    local next_page = "https://www.strawpoll.me/" .. item_name .. "/fetch-comments?page=" .. page
    addedUrls[#addedUrls + 1] = { url = next_page }
    addedUrls[#addedUrls + 1] = { url = next_page .. "&_=1653344833516" }
  end

  io.stderr:write(table.show(addedUrls, "Added URLs "))
  return addedUrls
end
--- Callback registered as `wget.callbacks.finish`.
-- Saves every entry of the global `ign` list, newline-separated, to
-- `$item_dir/alr_urls.txt` and prints a confirmation. None of the callback
-- parameters are used.
-- Raises an error when `item_dir` is unset or the file cannot be opened or
-- written.
wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time)
  local item_dir = assert(os.getenv("item_dir"), "item_dir environment variable is not set")
  local out = assert(io.open(item_dir .. "/alr_urls.txt", "w"))

  -- Write through the handle itself instead of io.output()/io.write():
  -- redirecting the default output and then closing that file would leave
  -- io.write pointing at a closed handle for anything that runs afterwards.
  local ok, write_err = out:write(table.concat(ign, "\n"))
  out:close()
  assert(ok, write_err)

  print("Wrote data.")
end
-- Module-level state shared by the callbacks in this file.
-- `ign` starts as the non-empty lines of `$item_dir/alr_urls.txt`; loading
-- fails with an error if that file cannot be opened.
local _alr_path = os.getenv("item_dir") .. "/alr_urls.txt"
local _data = readAll(_alr_path)
ign = split(_data, "\n")
-- Page counter; the get_urls callback increments it.
page = 1