From 8daf6b5245dedf8b1ec804847da5f2ec1141050e Mon Sep 17 00:00:00 2001 From: Leonetienne Date: Sun, 30 Jan 2022 21:02:18 +0100 Subject: [PATCH] Added functionality to also grab search terms --- artists.yaml | 2 - grab-all.py | 21 +++++-- grab.py => grab-artist.py | 45 ++------------ grab-search.py | 121 ++++++++++++++++++++++++++++++++++++++ readme.md | 54 ++++++++++++++--- to-grab.yaml | 16 +++++ util.py | 34 +++++++++++ 7 files changed, 240 insertions(+), 53 deletions(-) delete mode 100644 artists.yaml rename grab.py => grab-artist.py (75%) create mode 100644 grab-search.py create mode 100644 to-grab.yaml diff --git a/artists.yaml b/artists.yaml deleted file mode 100644 index a6aa03e..0000000 --- a/artists.yaml +++ /dev/null @@ -1,2 +0,0 @@ -- mixppl -- shumolly diff --git a/grab-all.py b/grab-all.py index 12fd0b4..58d92ae 100644 --- a/grab-all.py +++ b/grab-all.py @@ -1,13 +1,26 @@ import yaml import os -with open("artists.yaml", "r") as yamlfile: +with open("to-grab.yaml", "r") as yamlfile: try: config = yaml.safe_load(yamlfile) - for artist in config: - print(f"\033[92mGrabbing artist '{artist}'") - os.system(f"python3 grab.py '{artist}'") + # Grab artists + if "artists" in config: + for artist in config["artists"]: + print(f"\033[92mGrabbing artist '{artist}'") + os.system(f"python3 grab-artist.py '{artist}'") + + # Grab search results + if "searches" in config: + for search in config["searches"]: + print(f"\033[92mGrabbing search results for '{search['terms']}'") + + max_results = "" + if "max" in search: + max_results = search["max"] + + os.system("python3 grab-search.py '" + search['terms'] + "' " + str(max_results)) except yaml.YAMLError as exc: print("You fucked up the yaml format.") diff --git a/grab.py b/grab-artist.py similarity index 75% rename from grab.py rename to grab-artist.py index ce7693a..c3ca184 100644 --- a/grab.py +++ b/grab-artist.py @@ -1,8 +1,6 @@ import requests import mimetypes -import os import sys -import urllib.request from pathlib 
import Path from datetime import datetime import time @@ -13,38 +11,7 @@ from headers import * # SYNPOSIS: # To download posts from an artist: -# python3 grab.py mixppl - -def isPostAlreadySaved(post_id): - idset_filename = "./already_saved/" + slugify(artist_name) + ".txt" - - # Does the index file even exist yet? - if not os.path.exists(idset_filename): - return False - - # Open the index file - index_file = open(idset_filename, "r") # Open existing or create - - # Store lines in array - already_downloaded_post_ids = index_file.readlines() - - return (post_id + "\n") in already_downloaded_post_ids - -def markPostAsSaved(post_id): - idset_filename = "./already_saved/" + slugify(artist_name) + ".txt" - - # Open the index file - index_file = open(idset_filename, "a") # Open existing or create - index_file.write(post_id + "\n") - index_file.close() - - -def downloadMedia(url, filename): - # Prepare and execute query to download images - opener = urllib.request.build_opener() - opener.addheaders = image_request_headers - urllib.request.install_opener(opener) - source = urllib.request.urlretrieve(asset_image_url, filename) +# python3 grab-artist.py mixppl # 2 minute timeout in case something gets stuck. socket.setdefaulttimeout(120) @@ -90,7 +57,7 @@ try: logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name) # Have we already downloaded this post? - if not isPostAlreadySaved(project_hash_id): + if not isPostAlreadySaved(project_hash_id, artist_name): # Fetch information about the project project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers) @@ -116,7 +83,7 @@ try: logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name) # After downloading all assets, mark the project as downloaded. 
- markPostAsSaved(project_hash_id) + markPostAsSaved(project_hash_id, artist_name) # Project is already downloaded else: @@ -124,7 +91,7 @@ try: logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name) -except socket.timeout: +except socket.timeout as exc: logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name) -except: - logMsg("Failed for some reason!", "err", artist_name) +except BaseException as exc: + logMsg("Failed for some reason!: " + repr(exc), "err", artist_name) diff --git a/grab-search.py b/grab-search.py new file mode 100644 index 0000000..1663a96 --- /dev/null +++ b/grab-search.py @@ -0,0 +1,121 @@ +import requests +import mimetypes +import sys +from pathlib import Path +from datetime import datetime +import time +import socket + +from util import * +from headers import * + +# SYNPOSIS: +# To download 100 (or fewer, if there aren't enough) artworks of the search term "game of thrones", call +# python3 grab-search.py "game of thrones" 100 +# If max-projects isn't specified, it will fetch them all (beware! i really mean ALL! At this time, this would be over 12000 projects for our game of thrones example). + +# 2 minute timeout in case something gets stuck. +socket.setdefaulttimeout(120) + +search_terms = str.lower(sys.argv[1]) +search_terms_filename = "search_" + slugify(search_terms) + +max_projects = sys.maxsize +# Is max-posts specified? 
+if len(sys.argv) >= 3: + max_projects = int(sys.argv[2]) + +# Create artist directory if it doesn't exist +artist_directory = "./downloads/" + search_terms_filename + "/" +Path(artist_directory).mkdir(parents=True, exist_ok=True) + +# Create directory for already saved posts, and generate filename +Path("./already_saved/").mkdir(parents=True, exist_ok=True) + +# Create directory for logging, and generate filename +Path("./logs/").mkdir(parents=True, exist_ok=True) + +if max_projects == sys.maxsize: + logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: ALL OF THEM!", "okndl", search_terms_filename) +else: + logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: {max_projects}", "okndl", search_terms_filename) + +# Request project info for artist +lastPageReached = False +pageCounter = 1 +projectCounter = 0 +try: + while not lastPageReached: + logMsg(f"Fetching search result page #{pageCounter} for '{search_terms}'...", "okndl", search_terms_filename) + projects_data = requests.get(f"https://www.artstation.com/api/v2/search/projects.json?page={pageCounter}&per_page=50&sorting=relevance&query={search_terms.replace(' ', '+')}", headers=project_fetch_headers) + projects = projects_data.json()["data"] + result_size = projects_data.json()["total_count"] + page_num_projects = len(projects) + + + + lastPageReached = page_num_projects < 50 # Each full page contains 50 projects. If it has less than 50, it is the last page + + if not lastPageReached: + pageCounter = pageCounter + 1 + logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page...", "okndl", search_terms_filename) + else: + logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page... 
This is the last page!", "okndl", search_terms_filename) + + + # For each project in all of the artists projects + for project in projects: + if projectCounter >= max_projects: + logMsg(f"Reached project download limit of {max_projects}. Stopping...", "okndl", search_terms_filename) + exit(0) + + project_name = project["title"] + project_hash_id = project["hash_id"] + project_artist_name = project["user"]["username"] + project_artist_name_fullname = project["user"]["full_name"] + + logMsg(f"Found project '{project_name}' of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) with project id {project_hash_id}. Fetching more info about it...", "okndl", search_terms_filename) + + # Have we already downloaded this post? + if not isPostAlreadySaved(project_hash_id, search_terms_filename): + + # Fetch information about the project + project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers) + assets = project_info.json()["assets"] + + # For each asset in the project (might be multiple images) + for asset in assets: + asset_type = asset["asset_type"] + + # If the asset is an image + if asset_type == "image": + asset_image_url = asset["image_url"] + asset_position = asset["position"] + + # Generate a download filename + filename = artist_directory + slugify(project_artist_name) + "_" + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url) + + logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. 
Downloading to '{filename}'...", "okdl", search_terms_filename) + + # Download it + downloadMedia(asset_image_url, filename) + else: + logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Skipping...", "okdl", search_terms_filename) + + # After downloading all assets, mark the project as downloaded. + markPostAsSaved(project_hash_id, search_terms_filename) + projectCounter = projectCounter + 1 + + # Project is already downloaded + else: + logMsg(f"Skipping project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) because it is already downloaded.", "okndl", search_terms_filename) + + logMsg(f"Finished all search result pages of '{search_terms}'... Total pages scanned: {pageCounter}", "okndl", search_terms_filename) + +except socket.timeout as exc: + logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", search_terms_filename) +except SystemExit: + # That's... why i'm here + exit(0) +except BaseException as exc: + logMsg("Failed for some reason!: " + repr(exc), "err", search_terms_filename) diff --git a/readme.md b/readme.md index 6cedbdd..8a3d055 100644 --- a/readme.md +++ b/readme.md @@ -3,6 +3,9 @@ This is a personal and untested tool to keep (and update) local copies of artsta Project id's are saved in order to not re-download everything. This ensures that only new media will be downloaded in each cronjob instance. No fancy bs going on. Just a simple script bodged together within 10 minutes at 3 AM. +Also supports to download search results. + + Currently only working for images. Feel free to issue a pull request if you want more. ## Setup @@ -13,22 +16,57 @@ pip3 install requests pyyaml All scripts require Python3. Tested using 3.9.9. 
## Running it -Here we have two scripts: +Here we have three scripts: + +### Grab an artist's profile ```bash -grab.py $artist-name +grab-artist.py 'mixppl' ``` -This will grab one individual profile. +This will grab one individual profile, in this case the user 'mixppl'. You must use the username from the profile's URL, not the full name! ---- +### Grab search results +```bash +grab-search.py 'Game of Thrones' 100 +``` +This will grab the first 100 results of the search term 'Game of Thrones'. +If you omit the result limit, **ALL** results will be downloaded! That can be useful if your search query is very niche. But if you omit it for a popular search term, like 'Game of Thrones', you're in for a ride, +as all approx 12000 projects will be queued for download. +### Automate it +### Invoke a scan ```bash grab-all.py ``` -This will call `grab.py` on all artists listed in `artists.yaml`. +This will call `grab-artist.py` and `grab-search.py` for all artists and search terms listed in `to-grab.yaml`. + +Files will be saved to `./downloads/{artist_name}/*.{ext}` and `./downloads/search_{search_terms}/*{artist_id}_*.{ext}`. +Logs will be saved to `./logs/{artist_name/search_terms}.txt`. +Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name/search_terms}.txt`. + +> :warning: Projects already downloaded from an artist's page will be downloaded **again** if they appear in a search term, and vice versa. Artist- and search queries do NOT share download indices! + +### Configure what to download +Simply adjust [`to-grab.yaml`](https://github.com/Leonetienne/Artstation-grabber/blob/master/to-grab.yaml) to your needs. Here is an example: +``` +--- +artists: + - mixppl + - shumolly + +searches: + - + terms: Game of Thrones + max: 3 + + - + terms: Pirates + max: 3 + + - + terms: robby rotton +``` +The last search term, 'robby rotton', shows that you can also omit `max`. If you do not want to fetch artists, or searches, at all, just delete that yaml-array entirely. 
-Files will be saved to `./downloads/{artist_name}/*.{ext}`. -Logs will be saved to `./logs/{artist_name}.txt`. -Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name}.txt`. ## A word on power usage Do not overuse this or you might piss of artstations maintainers. Just because you CAN download 400 gigabytes of images per day doesn't mean that you should! diff --git a/to-grab.yaml b/to-grab.yaml new file mode 100644 index 0000000..816d89d --- /dev/null +++ b/to-grab.yaml @@ -0,0 +1,16 @@ +--- +artists: + - mixppl + - shumolly + +searches: + - + terms: Game of Thrones + max: 3 + + - + terms: Pirates + max: 3 + + - + terms: robby rotton diff --git a/util.py b/util.py index 9c0801c..456fc8c 100644 --- a/util.py +++ b/util.py @@ -3,6 +3,9 @@ from datetime import datetime import time from pathlib import Path import re +import os +import urllib.request +from headers import * class bcolors: HEADER = '\033[95m' @@ -92,3 +95,34 @@ def slugify(value, allow_unicode=False): def getCurrentTimestamp(): return datetime.utcfromtimestamp(time.time()).strftime("%m-%d-%Y %H-%M") + +def isPostAlreadySaved(post_id, artist_name): + idset_filename = "./already_saved/" + slugify(artist_name) + ".txt" + + # Does the index file even exist yet? 
+ if not os.path.exists(idset_filename): + return False + + # Open the index file + index_file = open(idset_filename, "r") # Open existing or create + + # Store lines in array + already_downloaded_post_ids = index_file.readlines() + + return (post_id + "\n") in already_downloaded_post_ids + +def markPostAsSaved(post_id, artist_name): + idset_filename = "./already_saved/" + slugify(artist_name) + ".txt" + + # Open the index file + index_file = open(idset_filename, "a") # Open existing or create + index_file.write(post_id + "\n") + index_file.close() + + +def downloadMedia(url, filename): + # Prepare and execute query to download images + opener = urllib.request.build_opener() + opener.addheaders = image_request_headers + urllib.request.install_opener(opener) + source = urllib.request.urlretrieve(url, filename)