From 2373ce0416c6dae447580f3daceb5ebc30c9a5e1 Mon Sep 17 00:00:00 2001
From: Leonetienne
Date: Sun, 30 Jan 2022 03:38:53 +0100
Subject: [PATCH] Made that darn thing

---
 .gitignore            |   4 +
 artists.yaml          |   2 +
 grab.py               | 258 ++++++++++++++++++++++++++++++++++++++++++
 launch.py             |  27 ++++++
 required-packages.txt |   4 +
 5 files changed, 295 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 artists.yaml
 create mode 100644 grab.py
 create mode 100644 launch.py
 create mode 100644 required-packages.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a682ba9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/logs/
+/downloads/
+/already_saved/
+
diff --git a/artists.yaml b/artists.yaml
new file mode 100644
index 0000000..a6aa03e
--- /dev/null
+++ b/artists.yaml
@@ -0,0 +1,2 @@
+- mixppl
+- shumolly
diff --git a/grab.py b/grab.py
new file mode 100644
index 0000000..4854e6b
--- /dev/null
+++ b/grab.py
@@ -0,0 +1,258 @@
+import requests
+import mimetypes
+import os
+import sys
+import urllib.request
+import unicodedata
+import re
+from pathlib import Path
+from datetime import datetime, timezone
+from urllib.parse import urlsplit
+import time
+
+# SYNOPSIS:
+# To download posts from an artist:
+# python3 grab.py mixppl
+
+# Name of the artist being processed. main() sets it once from the command
+# line; the logging and bookkeeping helpers read it to pick per-artist files.
+artist_name = ""
+
+# Seconds to wait for a response before giving up on a request.
+REQUEST_TIMEOUT = 30
+
+
+class bcolors:
+    """ANSI escape sequences used to colour console output."""
+
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+def logMsg(msg, mode):
+    """
+    Print msg to the console in the colour for mode and append it to the
+    artist's logfile (./logs/<artist>.txt).
+
+    mode is one of "okdl", "okndl", "warn" or "err". Any other value is
+    reported on the console and the message is then logged as an error.
+    """
+    if mode == "okdl":
+        col = bcolors.OKCYAN
+        prefix = "[OK_DL ]"
+
+    elif mode == "okndl":
+        col = bcolors.OKBLUE
+        prefix = "[OK_NO_DL]"
+
+    elif mode == "warn":
+        col = bcolors.WARNING
+        prefix = "[WARNING ]"
+
+    elif mode == "err":
+        col = bcolors.FAIL
+        prefix = "[ERROR ]"
+
+    else:
+        print(bcolors.FAIL + "SUPPLIED INVALID LOG MODE!!! USE EITHER okdl, okndl, warn, or err!" + bcolors.ENDC)
+        # Fall back to the error style so col and prefix are always strings
+        # for the concatenation below.
+        col = bcolors.FAIL
+        prefix = "[ERROR ]"
+
+    timestamp = getCurrentTimestamp()
+
+    # Log to console, resetting the colour so it does not leak into later output
+    print(col + "[" + timestamp + "]: " + msg + bcolors.ENDC)
+
+    # Log to logfile (append mode opens the existing file or creates it)
+    with open("./logs/" + slugify(artist_name) + ".txt", "a", encoding="utf-8") as logfile:
+        logfile.write(prefix + " " + "[" + timestamp + "]: " + msg + "\n")
+
+
+def extensionFromUrl(url):
+    """
+    Return the file extension of the resource url points to, without the
+    leading dot (e.g. "jpg"). Query string and fragment are ignored. Returns
+    an empty string if the path has no extension.
+    """
+    # Only look at the path, so dots inside GET parameters cannot be mistaken
+    # for the start of the extension.
+    path = urlsplit(url).path
+    return os.path.splitext(path)[1][1:]
+
+
+def slugify(value, allow_unicode=False):
+    """
+    Taken from https://github.com/django/django/blob/master/django/utils/text.py
+    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
+    dashes to single dashes. Remove characters that aren't alphanumerics,
+    underscores, or hyphens. Convert to lowercase. Also strip leading and
+    trailing whitespace, dashes, and underscores.
+    """
+    value = str(value)
+    if allow_unicode:
+        value = unicodedata.normalize('NFKC', value)
+    else:
+        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
+    value = re.sub(r'[^\w\s-]', '', value.lower())
+    return re.sub(r'[-\s]+', '-', value).strip('-_')
+
+
+def getCurrentTimestamp():
+    """Return the current UTC time formatted as "MM-DD-YYYY HH-MM"."""
+    return datetime.now(timezone.utc).strftime("%m-%d-%Y %H-%M")
+
+
+def isPostAlreadySaved(post_id):
+    """Return True if post_id is listed in the artist's index of saved posts."""
+    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
+
+    try:
+        with open(idset_filename, "r") as index_file:
+            return any(line.rstrip("\n") == post_id for line in index_file)
+    except FileNotFoundError:
+        # No index file yet means nothing has been saved for this artist
+        return False
+
+
+def markPostAsSaved(post_id):
+    """Append post_id to the artist's index of saved posts."""
+    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
+
+    # Append mode opens the existing index or creates it
+    with open(idset_filename, "a") as index_file:
+        index_file.write(post_id + "\n")
+
+
+def downloadMedia(url, filename):
+    """Download url to the file filename, sending image_request_headers."""
+    # Prepare and execute query to download images
+    opener = urllib.request.build_opener()
+    opener.addheaders = image_request_headers
+    urllib.request.install_opener(opener)
+    # Fetch the url argument (not a module-level variable), so the function
+    # downloads exactly what its caller asked for.
+    urllib.request.urlretrieve(url, filename)
+
+
+# Headers sent with the JSON requests for project data
+project_fetch_headers = {
+    'authority': 'www.artstation.com',
+    'pragma': 'no-cache',
+    'cache-control': 'no-cache',
+    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"',
+    'upgrade-insecure-requests': '1',
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+    'sec-fetch-site': 'none',
+    'sec-fetch-mode': 'navigate',
+    'sec-fetch-user': '?1',
+    'sec-fetch-dest': 'document',
+    'accept-language': 'de-DE,de;q=0.9'
+}
+
+# Headers sent with the image downloads
+image_request_headers = [
+    ('authority', 'cdna.artstation.com'),
+    ('pragma', 'no-cache'),
+    ('cache-control', 'no-cache'),
+    ('sec-ch-ua', '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"'),
+    ('sec-ch-ua-mobile', '?0'),
+    ('sec-ch-ua-platform', '"Windows"'),
+    ('upgrade-insecure-requests', '1'),
+    ('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'),
+    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
+    ('sec-fetch-site', 'none'),
+    ('sec-fetch-mode', 'navigate'),
+    ('sec-fetch-user', '?1'),
+    ('sec-fetch-dest', 'document'),
+    ('accept-language', 'de-DE,de;q=0.9')
+]
+
+
+def main():
+    """
+    Download every image of every project of the artist named on the
+    command line into ./downloads/<artist>/.
+    """
+    global artist_name
+
+    if len(sys.argv) < 2:
+        print(bcolors.FAIL + "Usage: python3 grab.py <artist>" + bcolors.ENDC)
+        sys.exit(1)
+
+    artist_name = str.lower(sys.argv[1])
+
+    # Create artist directory if it doesn't exist
+    artist_directory = "./downloads/" + slugify(artist_name) + "/"
+    Path(artist_directory).mkdir(parents=True, exist_ok=True)
+
+    # Create directory for already saved posts
+    Path("./already_saved/").mkdir(parents=True, exist_ok=True)
+
+    # Create directory for logging
+    Path("./logs/").mkdir(parents=True, exist_ok=True)
+
+    # Request project info for artist
+    projects_data = requests.get(f"https://www.artstation.com/users/{artist_name}/projects.json", headers=project_fetch_headers, timeout=REQUEST_TIMEOUT)
+    if not projects_data.ok:
+        logMsg(f"Could not fetch the projects of {artist_name} (HTTP {projects_data.status_code}).", "err")
+        sys.exit(1)
+    projects = projects_data.json()["data"]
+
+    # For each project in all of the artists projects
+    for project in projects:
+        project_name = project["title"]
+        project_hash_id = project["hash_id"]
+
+        logMsg(f"Found project {project_name} with id {project_hash_id}. Fetching more info about it...", "okndl")
+
+        # Have we already downloaded this post?
+        if isPostAlreadySaved(project_hash_id):
+            logMsg(f"Skipping project {project_name} [{project_hash_id}] because it is already downloaded.", "okndl")
+            continue
+
+        # Fetch information about the project
+        project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers, timeout=REQUEST_TIMEOUT)
+        if not project_info.ok:
+            # Leave the project unmarked so the next run retries it
+            logMsg(f"Could not fetch project {project_name} [{project_hash_id}] (HTTP {project_info.status_code}). Skipping...", "warn")
+            continue
+        assets = project_info.json()["assets"]
+
+        # For each asset in the project (might be multiple images)
+        for asset in assets:
+            asset_type = asset["asset_type"]
+            # Read the position up front: the non-image branch logs it too
+            asset_position = asset["position"]
+
+            # If the asset is an image
+            if asset_type == "image":
+                asset_image_url = asset["image_url"]
+
+                # Generate a download filename
+                filename = artist_directory + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)
+
+                logMsg(f"Found image-asset for project {project_name} [{project_hash_id}] at position {asset_position}. Downloading to '{filename}'...", "okdl")
+
+                # Download it
+                downloadMedia(asset_image_url, filename)
+            else:
+                logMsg(f"Found non-image-asset for project {project_name} [{project_hash_id}] at position {asset_position}. Skipping...", "okndl")
+
+        # After downloading all assets, mark the project as downloaded.
+        markPostAsSaved(project_hash_id)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/launch.py b/launch.py
new file mode 100644
index 0000000..12fd0b4
--- /dev/null
+++ b/launch.py
@@ -0,0 +1,27 @@
+import yaml
+import os
+import subprocess
+import sys
+
+
+def main():
+    """Run grab.py once for every artist listed in artists.yaml."""
+    with open("artists.yaml", "r") as yamlfile:
+        try:
+            config = yaml.safe_load(yamlfile)
+        except yaml.YAMLError as exc:
+            print("You fucked up the yaml format.")
+            # Show the parser's message so the broken line can be found
+            print(exc)
+            sys.exit(1)
+
+    # An empty file parses to None; treat it as an empty artist list
+    for artist in config or []:
+        print(f"\033[92mGrabbing artist '{artist}'")
+        # Pass the name as its own argument instead of building a shell
+        # command, so shell syntax in artists.yaml cannot be executed.
+        subprocess.run([sys.executable, "grab.py", str(artist)])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/required-packages.txt b/required-packages.txt
new file mode 100644
index 0000000..03a040b
--- /dev/null
+++ b/required-packages.txt
@@ -0,0 +1,4 @@
+install these via pip3
+
+requests
+pyyaml