Works?
This commit is contained in:
parent
61902ad0cd
commit
a91ee37b69
|
@ -1 +1,5 @@
|
|||
__pycache__/
|
||||
warcs/
|
||||
*.pem
|
||||
*-warcprox-ca/
|
||||
warcprox*
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
# Bilibili scraper
|
||||
This was made at someone else's request.
|
||||
|
||||
## Limitations
|
||||
- Cannot scrape video
|
||||
- Cannot scrape audio
|
||||
|
||||
## Requirements for wrapper.py
|
||||
- Python 3.7 (tested on 3.9)
|
||||
- alive_progress (installable via pip)
|
||||
- requests (installable via pip)
|
||||
- cprint (installable via pip)
|
||||
|
||||
## Requirements for the module
|
||||
- requests (installable via pip)
|
||||
|
||||
## To use
|
||||
It is recommended to use this as a module rather than via wrapper.py. `wrapper.py` is only an example, but it should work fine to scrape a user.
|
||||
|
||||
---
|
||||
IMPORTANT
|
||||
---
|
||||
|
||||
This comes with ABSOLUTELY NO WARRANTY, to the extent permitted by applicable law.
|
||||
Licensed under the Apache-2.0 licence. Copyright (c) 2022 TheTechRobo.
|
21
bilibili.py
21
bilibili.py
|
@ -14,6 +14,27 @@ def userScraper(id):
|
|||
for url, explanation in urls:
|
||||
yield requests.get(url).json(), explanation
|
||||
|
||||
def articleScraper(id):
    """Yield (parsed_json, explanation) pairs for every article-listing endpoint of a user.

    id: the bilibili user id (mid) whose articles should be scraped.

    Yields:
        tuple: (decoded JSON response, short string tag describing the endpoint).
    """
    urls = []
    page = 1
    size = 12  # page size accepted by the x/space/article endpoint
    urls.append((f"https://api.bilibili.com/x/web-interface/card?mid={id}&article=true", "PROFILE_ARTICLE_CARD"))
    article_count = requests.get(f"https://api.bilibili.com/x/space/navnum?mid={id}").json()["data"]["article"]
    # Ceiling division: a partial final page still needs its own request.
    # (The old float division `count / size` with `page <= maxpage` dropped
    # the last page whenever count was not a multiple of size.)
    maxpage = -(-article_count // size)
    while page <= maxpage:
        for sort_type in ["view", "publish_time", "fav"]:
            # Fix: the mid was hard-coded to 233193626, so every caller
            # scraped that one fixed user instead of the requested id.
            urls.append((f"https://api.bilibili.com/x/space/article?mid={id}&pn={page}&ps={size}&sort={sort_type}&jsonp=jsonp", f"MASTER_LIST_{sort_type}"))
        page += 1
    urls.append((f"https://api.bilibili.com/x/article/up/lists?mid={id}&sort=0&jsonp=jsonp", "LISTS"))
    for url, expl in urls:
        yield requests.get(url).json(), expl
|
||||
|
||||
def individualArticleScraper(id):
    """Yield (content, explanation) pairs for a single article.

    id: the numeric article (cv) id.

    Yields the decoded JSON for API endpoints; for the plain HTML article
    page the response text is yielded instead, since it is not JSON.
    """
    urls = []
    urls.append((f"https://api.bilibili.com/x/article/viewinfo?id={id}&mobi_app=pc&from=web", "VIEWINFO"))
    urls.append((f"https://www.bilibili.com/read/cv{id}", "readCV"))
    for url, expl in urls:
        response = requests.get(url)
        try:
            yield response.json(), expl
        except ValueError:
            # Fix: the readCV URL returns an HTML page, so .json() raised
            # here unconditionally. Yield the raw text for non-JSON bodies.
            yield response.text, expl
|
||||
|
||||
def albumScraper(id):
|
||||
page = 0
|
||||
size = 30
|
||||
|
|
46
wrapper.py
46
wrapper.py
|
@ -1,4 +1,5 @@
|
|||
import bilibili, json, requests
|
||||
from alive_progress import alive_bar
|
||||
from cprint import cprint
|
||||
|
||||
cprint.warn("PLEASE NOTE:\n\tThis is meant to be used with Warcprox. **NO DATA IS SAVED ANYWHERE.**")
|
||||
|
@ -10,16 +11,18 @@ PROFILE = input("Please enter the profile ID: ")
|
|||
|
||||
list(bilibili.userScraper(PROFILE)) # get metadata
|
||||
|
||||
for images, _ in bilibili.albumScraper(PROFILE):
|
||||
for image in images["data"]["items"]:
|
||||
QUEUED.append((image["dyn_id"], "IMAGE_POST"))
|
||||
cprint.info("Scraping album pagination for URLs...")
|
||||
|
||||
progress = 0
|
||||
with alive_bar() as bar:
|
||||
for images, _ in bilibili.albumScraper(PROFILE):
|
||||
for image in images["data"]["items"]:
|
||||
QUEUED_IMAGES.append((image["dyn_id"], "IMAGE_POST"))
|
||||
bar()
|
||||
|
||||
for item, typee in QUEUED:
|
||||
if typee == "IMAGE_POST":
|
||||
if progress % 10 == 0:
|
||||
cprint.ok(f"Scraping, {progress} results so far")
|
||||
cprint.info("Now downloading image metadata...")
|
||||
|
||||
with alive_bar(total=len(QUEUED_IMAGES)) as bar:
|
||||
for item, typee in QUEUED_IMAGES:
|
||||
post = bilibili.postScraper(item)["data"]["card"]
|
||||
try:
|
||||
QUEUED.append((post["display"]["attach_card"]["cover_url"], "ATTACH_CARD_COVER_URL"))
|
||||
|
@ -35,7 +38,28 @@ for item, typee in QUEUED:
|
|||
for picture in card["item"]["pictures"]:
|
||||
QUEUED.append((picture["img_src"], "IMAGE"))
|
||||
QUEUED.append((card["user"]["head_url"], "PFP"))
|
||||
progress += 1
|
||||
else:
|
||||
bar()
|
||||
|
||||
cprint.info("Finished image collection.")
|
||||
cprint.info("Downloading articles...")
|
||||
|
||||
with alive_bar() as bar:
    for articles, __ in bilibili.articleScraper(PROFILE):
        # Only the paginated master lists carry article entries.
        if not __.startswith("MASTER_LIST_"):
            continue
        for article in articles["data"]["articles"]:
            for url in article["image_urls"]:
                QUEUED.append((url, "ArticleImageUrl"))
                bar()
            for otherurl in article["origin_image_urls"]:
                # Fix: previously appended `url` (the stale loop variable
                # from image_urls above), so origin images were never queued.
                QUEUED.append((otherurl, "ArticleOriginImgUrl"))
                bar()
|
||||
|
||||
cprint.info("Finished article download.")
|
||||
cprint.info("Downloading queued images and posts...")
|
||||
|
||||
with alive_bar(dual_line=True, total=len(QUEUED)) as bar:
    for item, typee in QUEUED:
        # Fix: the status line interpolated `image` — a stale variable left
        # over from an earlier loop — instead of the item being fetched.
        bar.text = f"{item} ({typee})"
        # The response body is intentionally discarded: warcprox records the
        # traffic, so issuing the GET is the whole point.
        requests.get(item)
        cprint.info(f"Downloaded {item} ({typee})")
        bar()
|
||||
|
|
Loading…
Reference in New Issue