diff --git a/.gitignore b/.gitignore
index c18dd8d..7d20b26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,5 @@
 __pycache__/
+warcs/
+*.pem
+*-warcprox-ca/
+warcprox*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c93f659
--- /dev/null
+++ b/README.md
@@ -0,0 +1,52 @@
+# Bilibili scraper
+This was made at someone else's request.
+
+## Limitations
+- Cannot scrape video
+- Cannot scrape audio
+
+## Requirements for wrapper.py
+- Python 3.7+ (tested on 3.9)
+- alive_progress (installable via pip)
+- requests (installable via pip)
+- cprint (installable via pip)
+
+## Requirements for the module
+- requests (installable via pip)
+
+## To use
+It is recommended to use this as a module rather than through `wrapper.py`; `wrapper.py` is only an example, but it should work fine for scraping a user.
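+
+For example, a minimal sketch of scraping a user through the module (the profile ID
+below is a placeholder; the generators only fire their requests as you iterate them):
+
+```python
+import bilibili
+
+# Each generator yields a (parsed_json, explanation) tuple per endpoint hit
+for response, explanation in bilibili.userScraper("123456"):
+    print(explanation)
+```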
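+
+To actually save anything, run the scraper behind
+[warcprox](https://github.com/internetarchive/warcprox), which records the HTTP
+traffic into WARC files. A sketch, assuming warcprox is listening on its default
+port (8000) and its generated CA certificate is in the working directory (the
+exact `.pem` filename depends on your hostname):
+
+```python
+import os
+
+# Route all `requests` traffic through warcprox...
+os.environ["HTTP_PROXY"] = "http://localhost:8000"
+os.environ["HTTPS_PROXY"] = "http://localhost:8000"
+# ...and trust the MITM certificate that warcprox generates for HTTPS
+os.environ["REQUESTS_CA_BUNDLE"] = "./warcprox-ca.pem"
+```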
+
+---
+IMPORTANT
+---
+
+This comes with ABSOLUTELY NO WARRANTY, to the extent permitted by applicable law.
+Licensed under the Apache-2.0 licence. Copyright (c) 2022 TheTechRobo.
diff --git a/bilibili.py b/bilibili.py
index bbf13bb..2161b71 100644
--- a/bilibili.py
+++ b/bilibili.py
@@ -14,6 +14,30 @@ def userScraper(id):
     for url, explanation in urls:
         yield requests.get(url).json(), explanation
 
+def articleScraper(id):
+    urls = []
+    page = 1
+    size = 12
+    urls.append((f"https://api.bilibili.com/x/web-interface/card?mid={id}&article=true", "PROFILE_ARTICLE_CARD"))
+    total = requests.get(f"https://api.bilibili.com/x/space/navnum?mid={id}").json()["data"]["article"]
+    maxpage = (total + size - 1) // size  # round up so a partial final page is still fetched
+    while page <= maxpage:
+        for sort_type in ["view", "publish_time", "fav"]:
+            urls.append((f"https://api.bilibili.com/x/space/article?mid={id}&pn={page}&ps={size}&sort={sort_type}&jsonp=jsonp", f"MASTER_LIST_{sort_type}"))
+        page += 1
+    urls.append((f"https://api.bilibili.com/x/article/up/lists?mid={id}&sort=0&jsonp=jsonp", "LISTS"))
+    for url, expl in urls:
+        yield requests.get(url).json(), expl
+
+def individualArticleScraper(id):
+    urls = []
+    urls.append((f"https://api.bilibili.com/x/article/viewinfo?id={id}&mobi_app=pc&from=web", "VIEWINFO"))
+    urls.append((f"https://www.bilibili.com/read/cv{id}", "readCV"))
+    for url, expl in urls:
+        resp = requests.get(url)
+        # readCV is an HTML page, not a JSON API endpoint, so don't parse it
+        yield (resp.text if expl == "readCV" else resp.json()), expl
+
 def albumScraper(id):
     page = 0
     size = 30
diff --git a/wrapper.py b/wrapper.py
index 90ca14e..5a3e85f 100644
--- a/wrapper.py
+++ b/wrapper.py
@@ -1,4 +1,5 @@
 import bilibili, json, requests
+from alive_progress import alive_bar
 from cprint import cprint
 
 cprint.warn("PLEASE NOTE:\n\tThis is meant to be used with Warcprox. **NO DATA IS SAVED ANYWHERE.**")
@@ -10,16 +11,19 @@ PROFILE = input("Please enter the profile ID: ")
 
 list(bilibili.userScraper(PROFILE)) # get metadata
 
-for images, _ in bilibili.albumScraper(PROFILE):
-    for image in images["data"]["items"]:
-        QUEUED.append((image["dyn_id"], "IMAGE_POST"))
+cprint.info("Scraping album pagination for URLs...")
 
-progress = 0
+QUEUED_IMAGES = []
+with alive_bar() as bar:
+    for images, _ in bilibili.albumScraper(PROFILE):
+        for image in images["data"]["items"]:
+            QUEUED_IMAGES.append((image["dyn_id"], "IMAGE_POST"))
+            bar()
 
-for item, typee in QUEUED:
-    if typee == "IMAGE_POST":
-        if progress % 10 == 0:
-            cprint.ok(f"Scraping, {progress} results so far")
+cprint.info("Now downloading image metadata...")
+
+with alive_bar(total=len(QUEUED_IMAGES)) as bar:
+    for item, typee in QUEUED_IMAGES:
         post = bilibili.postScraper(item)["data"]["card"]
         try:
             QUEUED.append((post["display"]["attach_card"]["cover_url"], "ATTACH_CARD_COVER_URL"))
@@ -35,7 +39,28 @@ for item, typee in QUEUED:
             for picture in card["item"]["pictures"]:
                 QUEUED.append((picture["img_src"], "IMAGE"))
             QUEUED.append((card["user"]["head_url"], "PFP"))
-        progress += 1
-    else:
+        bar()
+
+cprint.info("Finished image collection.")
+cprint.info("Downloading articles...")
+
+with alive_bar() as bar:
+    for articles, __ in bilibili.articleScraper(PROFILE):
+        if not __.startswith("MASTER_LIST_"):
+            continue
+        for article in articles["data"]["articles"]:
+            for url in article["image_urls"]:
+                QUEUED.append((url, "ArticleImageUrl"))
+                bar()
+            for otherurl in article["origin_image_urls"]:
+                QUEUED.append((otherurl, "ArticleOriginImgUrl"))
+                bar()
+
+cprint.info("Finished article download.")
+cprint.info("Downloading queued images and posts...")
+
+with alive_bar(dual_line=True, total=len(QUEUED)) as bar:
+    for item, typee in QUEUED:
+        bar.text = f"{item} ({typee})"
         requests.get(item)
+        bar()