TheTechRobo 2022-03-25 17:22:44 -04:00
parent 61902ad0cd
commit a91ee37b69
4 changed files with 85 additions and 11 deletions

.gitignore

@@ -1 +1,5 @@
__pycache__/
warcs/
*.pem/
*-warcprox-ca/
warcprox*

README.md

@@ -0,0 +1,25 @@
# Bilibili scraper
This was made at someone else's request.
## Limitations
- Cannot scrape video
- Cannot scrape audio
## Requirements for wrapper.py
- Python 3.7+ (tested on 3.9)
- alive_progress (installable via pip)
- requests (installable via pip)
- cprint (installable via pip)
## Requirements for the module
- requests (installable via pip)
## To use
It is recommended to use this as a module rather than via `wrapper.py`; `wrapper.py` is an example, but it should work fine for scraping a user.
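For example, a minimal sketch of driving the module directly (the profile ID below is a placeholder; each scraper is a generator of `(response, label)` pairs):

```python
import bilibili

# Each scraper yields (parsed JSON response, label naming the endpoint).
# 123456 is a placeholder profile ID, not a real target.
for data, label in bilibili.userScraper(123456):
    print(label, data.get("code"))
```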
## IMPORTANT
This comes with ABSOLUTELY NO WARRANTY, to the extent permitted by applicable law.
Licensed under the Apache-2.0 licence. Copyright (c) 2022 TheTechRobo.

bilibili.py

@@ -14,6 +14,27 @@ def userScraper(id):
    for url, explanation in urls:
        yield requests.get(url).json(), explanation
def articleScraper(id):
    urls = []
    page = 1
    size = 12
    urls.append((f"https://api.bilibili.com/x/web-interface/card?mid={id}&article=true", "PROFILE_ARTICLE_CARD"))
    # Ceiling division, so a final partial page of articles is not skipped
    article_count = requests.get(f"https://api.bilibili.com/x/space/navnum?mid={id}").json()["data"]["article"]
    maxpage = (article_count + size - 1) // size
    while page <= maxpage:
        for sort_type in ["view", "publish_time", "fav"]:
            # Use the requested profile id rather than a hardcoded mid
            urls.append((f"https://api.bilibili.com/x/space/article?mid={id}&pn={page}&ps={size}&sort={sort_type}&jsonp=jsonp", f"MASTER_LIST_{sort_type}"))
        page += 1
    urls.append((f"https://api.bilibili.com/x/article/up/lists?mid={id}&sort=0&jsonp=jsonp", "LISTS"))
    for url, expl in urls:
        yield requests.get(url).json(), expl
def individualArticleScraper(id):
    urls = []
    urls.append((f"https://api.bilibili.com/x/article/viewinfo?id={id}&mobi_app=pc&from=web", "VIEWINFO"))
    urls.append((f"https://www.bilibili.com/read/cv{id}", "readCV"))
    for url, expl in urls:
        response = requests.get(url)
        # The readCV page is HTML, not JSON, so it cannot go through .json()
        yield (response.text if expl == "readCV" else response.json()), expl
def albumScraper(id):
    page = 0
    size = 30
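Because `individualArticleScraper` mixes a JSON API endpoint with the raw-HTML `readCV` page (see the comment in the function above), a caller has to branch on the label. A minimal sketch, with a placeholder article ID:

```python
# 12345 is a placeholder article ID. VIEWINFO yields parsed JSON,
# while readCV yields the raw HTML of the article page.
for payload, label in individualArticleScraper(12345):
    if label == "readCV":
        print(label, f"{len(payload)} characters of HTML")
    else:
        print(label, payload.get("code"))
```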

wrapper.py

@@ -1,4 +1,5 @@
import bilibili, json, requests
from alive_progress import alive_bar
from cprint import cprint
cprint.warn("PLEASE NOTE:\n\tThis is meant to be used with Warcprox. **NO DATA IS SAVED ANYWHERE.**")
@@ -10,16 +11,18 @@ PROFILE = input("Please enter the profile ID: ")
list(bilibili.userScraper(PROFILE)) # get metadata
for images, _ in bilibili.albumScraper(PROFILE):
    for image in images["data"]["items"]:
        QUEUED.append((image["dyn_id"], "IMAGE_POST"))
cprint.info("Scraping album pagination for URLs...")
progress = 0
with alive_bar() as bar:
    for images, _ in bilibili.albumScraper(PROFILE):
        for image in images["data"]["items"]:
            QUEUED_IMAGES.append((image["dyn_id"], "IMAGE_POST"))
            bar()
for item, typee in QUEUED:
    if typee == "IMAGE_POST":
        if progress % 10 == 0:
            cprint.ok(f"Scraping, {progress} results so far")
cprint.info("Now downloading image metadata...")
with alive_bar(total=len(QUEUED_IMAGES)) as bar:
    for item, typee in QUEUED_IMAGES:
        post = bilibili.postScraper(item)["data"]["card"]
        try:
            QUEUED.append((post["display"]["attach_card"]["cover_url"], "ATTACH_CARD_COVER_URL"))
@@ -35,7 +38,28 @@ for item, typee in QUEUED:
            for picture in card["item"]["pictures"]:
                QUEUED.append((picture["img_src"], "IMAGE"))
            QUEUED.append((card["user"]["head_url"], "PFP"))
    progress += 1
        else:
            bar()
cprint.info("Finished image collection.")
cprint.info("Downloading articles...")
with alive_bar() as bar:
    for articles, __ in bilibili.articleScraper(PROFILE):
        if not __.startswith("MASTER_LIST_"):
            continue
        for article in articles["data"]["articles"]:
            for url in article["image_urls"]:
                QUEUED.append((url, "ArticleImageUrl"))
                bar()
            for otherurl in article["origin_image_urls"]:
                # Queue the origin URL itself, not the last url from the loop above
                QUEUED.append((otherurl, "ArticleOriginImgUrl"))
                bar()
cprint.info("Finished article download.")
cprint.info("Downloading queued images and posts...")
with alive_bar(dual_line=True, total=len(QUEUED)) as bar:
    for item, typee in QUEUED:
        # Show the current queue item, not the stale `image` variable
        bar.text = f"{item} ({typee})"
        requests.get(item)
        cprint.info(f"Downloaded {item} ({typee})")
        bar()
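The wrapper itself saves nothing; archiving happens only if its HTTP traffic is routed through a running warcprox instance. A minimal sketch of that wiring, in which the port and CA-certificate path are assumptions rather than anything this repo configures:

```python
import requests

# Route a request through a locally running warcprox instance so the
# response is written to a WARC file. Port 8000 and the CA path are
# assumptions; adjust them to match how warcprox was started.
# mid=1 is a placeholder profile ID.
proxies = {"http": "http://localhost:8000", "https": "http://localhost:8000"}
r = requests.get(
    "https://api.bilibili.com/x/space/navnum?mid=1",
    proxies=proxies,
    verify="./warcprox-ca.pem",  # CA certificate generated by warcprox
)
print(r.status_code)
```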