Works?
This commit is contained in:
parent
61902ad0cd
commit
a91ee37b69
|
@ -1 +1,5 @@
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
warcs/
|
||||||
|
*.pem/
|
||||||
|
*-warcprox-ca/
|
||||||
|
warcprox*
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
# Bilibili scraper
|
||||||
|
This was made at someone else's request.
|
||||||
|
|
||||||
|
## Limitations
|
||||||
|
- Cannot scrape video
|
||||||
|
- Cannot scrape audio
|
||||||
|
|
||||||
|
## Requirements for wrapper.py
|
||||||
|
- Python 3.7 or newer (tested on 3.9)
|
||||||
|
- alive_progress (installable via pip)
|
||||||
|
- requests (installable via pip)
|
||||||
|
- cprint (installable via pip)
|
||||||
|
|
||||||
|
## Requirements for the module
|
||||||
|
- requests (installable via pip)
|
||||||
|
|
||||||
|
## To use
|
||||||
|
It is recommended to use this as a module rather than through `wrapper.py`. `wrapper.py` is an example, but should work fine to scrape a user.
|
||||||
|
|
||||||
|
---
|
||||||
|
IMPORTANT
|
||||||
|
---
|
||||||
|
|
||||||
|
This comes with ABSOLUTELY NO WARRANTY, to the extent permitted by applicable law.
|
||||||
|
Licenced under the Apache-2.0 licence. Copyright (c) 2022 TheTechRobo.
|
21
bilibili.py
21
bilibili.py
|
@ -14,6 +14,27 @@ def userScraper(id):
|
||||||
for url, explanation in urls:
|
for url, explanation in urls:
|
||||||
yield requests.get(url).json(), explanation
|
yield requests.get(url).json(), explanation
|
||||||
|
|
||||||
|
def articleScraper(id, session=None):
    """Yield the article-related API responses for the user ``id``.

    The generator requests, in order: the profile article card, every page
    of the user's article listing (once per sort order), and the user's
    article lists.

    Args:
        id: The Bilibili user ID (``mid``) to scrape.
        session: Optional object exposing a requests-compatible ``get(url)``
            method (for example a ``requests.Session``). When omitted, the
            module-level ``requests`` API is used.

    Yields:
        ``(decoded_json, label)`` tuples, where ``label`` is one of
        ``"PROFILE_ARTICLE_CARD"``, ``"MASTER_LIST_<sort>"`` or ``"LISTS"``.
    """
    http = requests if session is None else session
    size = 12

    navnum = http.get(f"https://api.bilibili.com/x/space/navnum?mid={id}").json()
    article_count = navnum["data"]["article"]
    # Ceiling division: a final, partially filled page must still be fetched.
    maxpage = (article_count + size - 1) // size

    urls = [(f"https://api.bilibili.com/x/web-interface/card?mid={id}&article=true", "PROFILE_ARTICLE_CARD")]
    for page in range(1, maxpage + 1):
        for sort_type in ("view", "publish_time", "fav"):
            # The listing must be requested for the given user, not a fixed one.
            urls.append((f"https://api.bilibili.com/x/space/article?mid={id}&pn={page}&ps={size}&sort={sort_type}&jsonp=jsonp", f"MASTER_LIST_{sort_type}"))
    urls.append((f"https://api.bilibili.com/x/article/up/lists?mid={id}&sort=0&jsonp=jsonp", "LISTS"))

    for url, expl in urls:
        yield http.get(url).json(), expl
|
||||||
|
|
||||||
|
def individualArticleScraper(id, session=None):
    """Yield the responses that make up a single article.

    Args:
        id: The numeric article ID (the ``cv`` number).
        session: Optional object exposing a requests-compatible ``get(url)``
            method (for example a ``requests.Session``). When omitted, the
            module-level ``requests`` API is used.

    Yields:
        ``(payload, label)`` tuples. ``payload`` is the decoded JSON when the
        response body is JSON (``"VIEWINFO"``); otherwise it is the raw
        response text (``"readCV"`` is a regular web page, not an API
        endpoint, so it cannot be decoded as JSON).
    """
    http = requests if session is None else session
    urls = [
        (f"https://api.bilibili.com/x/article/viewinfo?id={id}&mobi_app=pc&from=web", "VIEWINFO"),
        (f"https://www.bilibili.com/read/cv{id}", "readCV"),
    ]
    for url, expl in urls:
        response = http.get(url)
        try:
            payload = response.json()
        except ValueError:
            # Non-JSON body: hand back the text instead of aborting the
            # generator. JSON decode errors are ValueError subclasses.
            payload = response.text
        yield payload, expl
|
||||||
|
|
||||||
def albumScraper(id):
|
def albumScraper(id):
|
||||||
page = 0
|
page = 0
|
||||||
size = 30
|
size = 30
|
||||||
|
|
46
wrapper.py
46
wrapper.py
|
@ -1,4 +1,5 @@
|
||||||
import bilibili, json, requests
|
import bilibili, json, requests
|
||||||
|
from alive_progress import alive_bar
|
||||||
from cprint import cprint
|
from cprint import cprint
|
||||||
|
|
||||||
cprint.warn("PLEASE NOTE:\n\tThis is meant to be used with Warcprox. **NO DATA IS SAVED ANYWHERE.**")
|
cprint.warn("PLEASE NOTE:\n\tThis is meant to be used with Warcprox. **NO DATA IS SAVED ANYWHERE.**")
|
||||||
|
@ -10,16 +11,18 @@ PROFILE = input("Please enter the profile ID: ")
|
||||||
|
|
||||||
list(bilibili.userScraper(PROFILE)) # get metadata
|
list(bilibili.userScraper(PROFILE)) # get metadata
|
||||||
|
|
||||||
for images, _ in bilibili.albumScraper(PROFILE):
|
cprint.info("Scraping album pagination for URLs...")
|
||||||
for image in images["data"]["items"]:
|
|
||||||
QUEUED.append((image["dyn_id"], "IMAGE_POST"))
|
|
||||||
|
|
||||||
progress = 0
|
with alive_bar() as bar:
|
||||||
|
for images, _ in bilibili.albumScraper(PROFILE):
|
||||||
|
for image in images["data"]["items"]:
|
||||||
|
QUEUED_IMAGES.append((image["dyn_id"], "IMAGE_POST"))
|
||||||
|
bar()
|
||||||
|
|
||||||
for item, typee in QUEUED:
|
cprint.info("Now downloading image metadata...")
|
||||||
if typee == "IMAGE_POST":
|
|
||||||
if progress % 10 == 0:
|
with alive_bar(total=len(QUEUED_IMAGES)) as bar:
|
||||||
cprint.ok(f"Scraping, {progress} results so far")
|
for item, typee in QUEUED_IMAGES:
|
||||||
post = bilibili.postScraper(item)["data"]["card"]
|
post = bilibili.postScraper(item)["data"]["card"]
|
||||||
try:
|
try:
|
||||||
QUEUED.append((post["display"]["attach_card"]["cover_url"], "ATTACH_CARD_COVER_URL"))
|
QUEUED.append((post["display"]["attach_card"]["cover_url"], "ATTACH_CARD_COVER_URL"))
|
||||||
|
@ -35,7 +38,28 @@ for item, typee in QUEUED:
|
||||||
for picture in card["item"]["pictures"]:
|
for picture in card["item"]["pictures"]:
|
||||||
QUEUED.append((picture["img_src"], "IMAGE"))
|
QUEUED.append((picture["img_src"], "IMAGE"))
|
||||||
QUEUED.append((card["user"]["head_url"], "PFP"))
|
QUEUED.append((card["user"]["head_url"], "PFP"))
|
||||||
progress += 1
|
bar()
|
||||||
else:
|
|
||||||
|
cprint.info("Finished image collection.")
|
||||||
|
cprint.info("Downloading articles...")
|
||||||
|
|
||||||
|
# Queue every image URL referenced by the profile's article listings.
with alive_bar() as bar:
    for articles, label in bilibili.articleScraper(PROFILE):
        # Responses other than the paginated listings are skipped here.
        if not label.startswith("MASTER_LIST_"):
            continue
        for article in articles["data"]["articles"]:
            for url in article["image_urls"]:
                QUEUED.append((url, "ArticleImageUrl"))
                bar()
            for otherurl in article["origin_image_urls"]:
                # Queue the origin URL itself, not the leftover `url` from
                # the previous loop.
                QUEUED.append((otherurl, "ArticleOriginImgUrl"))
                bar()
|
||||||
|
|
||||||
|
cprint.info("Finished article download.")
|
||||||
|
cprint.info("Downloading queued images and posts...")
|
||||||
|
|
||||||
|
# Fetch every queued URL. The responses are discarded on purpose: per the
# startup warning, nothing is saved by this script itself.
with alive_bar(dual_line=True, total=len(QUEUED)) as bar:
    for item, typee in QUEUED:
        # Show the URL currently being fetched, not a stale variable left
        # over from the album loop.
        bar.text = f"{item} ({typee})"
        requests.get(item)
        bar()
|
||||||
|
|
Loading…
Reference in New Issue