local htmlparser = require "htmlparser"
require "table_show"

-- Read an entire file into a string.
function readAll(file)
  local f = assert(io.open(file, "rb"))
  local content = f:read("*all")
  f:close()
  return content
end

QUEUED_URLS = false

-- Split a string on a separator into a table of fields.
function split(s, sep)
  local fields = {}
  local pattern = string.format("([^%s]+)", sep)
  string.gsub(s, pattern, function(c) fields[#fields + 1] = c end)
  return fields
end

function startswith(text, prefix)
  return text:find(prefix, 1, true) == 1
end

-- Linear search; parameter renamed so it does not shadow the table library.
local function contains(tbl, val)
  for i = 1, #tbl do
    if tbl[i] == val then
      return true
    end
  end
  return false
end

-- Skip URLs that were already retrieved in a previous run.
wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason)
  if contains(ign, urlpos.url.url) then
    print("Skipping already-retrieved URL " .. urlpos.url.url)
    return false
  end
  return verdict
end

-- Record successfully fetched URLs, except comment pages and redirect stubs.
wget.callbacks.httploop_result = function(url, err, http_stat)
  io.stderr:write(http_stat.statcode .. " " .. url.url .. "\n")
  if string.match(url.url, "/fetch%-comments%?") then
    return
  end
  if string.match(url.url, "/r$") then
    return
  end
  if http_stat.statcode == 200 then
    table.insert(ign, url.url)
  end
end

-- Queue the next page of comments whenever a paginated comments URL is seen.
wget.callbacks.get_urls = function(file, url, is_css, iri)
  local addedUrls = {}
  local data = readAll(file)  -- page body (currently unused)
  local item_name = os.getenv("item_name")
  -- Match only the plain pagination URL (no "&_=" cache buster) so the page
  -- counter advances once per page; "%d+" also matches multi-digit pages.
  if url:match("https://www%.strawpoll%.me/[^/]+/fetch%-comments%?page=%d+$") then
    page = page + 1
    table.insert(addedUrls, { url = "https://www.strawpoll.me/" .. item_name .. "/fetch-comments?page=" .. page })
    table.insert(addedUrls, { url = "https://www.strawpoll.me/" .. item_name .. "/fetch-comments?page=" .. page .. "&_=1653344833516" })
  end
  io.stderr:write(table.show(addedUrls, "Added URLs "))
  return addedUrls
end

-- Persist the list of retrieved URLs so later runs can skip them.
wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time)
  local file = io.open(os.getenv("item_dir") .. "/alr_urls.txt", "w")
  file:write(table.concat(ign, "\n"))
  file:close()
  print("Wrote data.")
end

-- Load previously retrieved URLs; start with an empty list if the file does not exist yet.
local alr_path = os.getenv("item_dir") .. "/alr_urls.txt"
local ok, _data = pcall(readAll, alr_path)
ign = ok and split(_data, "\n") or {}
page = 1