Made that darn thing

2022-01-30 03:38:53 +01:00
commit 2373ce0416
5 changed files with 251 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/logs/
+/downloads/
+/already_saved/
+
--- a/artists.yaml
+++ b/artists.yaml
@@ -0,0 +1,2 @@
+- mixppl
+- shumolly
--- a/grab.py
+++ b/grab.py
@@ -0,0 +1,228 @@
+import requests
+import mimetypes
+import os
+import sys
+import urllib.request
+import unicodedata
+import re
+from pathlib import Path
+from datetime import datetime
+import time
+
+# SYNOPSIS:
+# To download posts from an artist:
+# python3 grab.py mixppl
+
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+def logMsg(msg, mode):
+
+    col = 0
+    prefix = 0
+
+    if mode == "okdl":
+        col = bcolors.OKCYAN
+        prefix = "[OK_DL   ]"
+
+    elif mode == "okndl":
+        col = bcolors.OKBLUE
+        prefix = "[OK_NO_DL]"
+
+    elif mode == "warn":
+        col = bcolors.WARNING
+        prefix = "[WARNING ]"
+
+    elif mode == "err":
+        col = bcolors.FAIL
+        prefix = "[ERROR   ]"
+    else:
+        print(bcolors.FAIL + "SUPPLIED INVALID LOG MODE!!! USE EITHER okdl, okndl, warn, or err!")
+
+    timestamp = getCurrentTimestamp()
+
+    # Log to console
+    print(col + "[" + timestamp + "]: " + msg)
+
+    # Log to logfile
+    logfile = open("./logs/" + slugify(artist_name) + ".txt", "a") # Open existing or create
+    logfile.write(prefix + " " + "[" + timestamp + "]: " + msg + "\n")
+    logfile.close()
+
+
+def extensionFromUrl(url):
+    rurl = url[::-1]
+    rext = ""
+    for c in rurl:
+        if c != '.':
+            rext = rext + c
+        else:
+            break
+
+    ext = rext[::-1]
+
+    # Now remove the get parameters
+    foundQuestionmark = False
+    actualExt = ""
+    for c in ext:
+        if c == '?':
+            foundQuestionmark = True
+
+        if not foundQuestionmark:
+            actualExt = actualExt + c
+
+    return actualExt
+
+
+def slugify(value, allow_unicode=False):
+    """
+    Taken from https://github.com/django/django/blob/master/django/utils/text.py
+    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
+    dashes to single dashes. Remove characters that aren't alphanumerics,
+    underscores, or hyphens. Convert to lowercase. Also strip leading and
+    trailing whitespace, dashes, and underscores.
+    """
+    value = str(value)
+    if allow_unicode:
+        value = unicodedata.normalize('NFKC', value)
+    else:
+        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
+    value = re.sub(r'[^\w\s-]', '', value.lower())
+    return re.sub(r'[-\s]+', '-', value).strip('-_')
+
+
+def getCurrentTimestamp():
+    return datetime.utcfromtimestamp(time.time()).strftime("%m-%d-%Y %H-%M")
+
+def isPostAlreadySaved(post_id):
+    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
+
+    # Does the index file even exist yet?
+    if not os.path.exists(idset_filename):
+        return False
+
+    # Open the index file
+    index_file = open(idset_filename, "r") # Open existing or create
+
+    # Store lines in array
+    already_downloaded_post_ids = index_file.readlines()
+
+    return (post_id + "\n") in already_downloaded_post_ids
+
+def markPostAsSaved(post_id):
+    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
+
+    # Open the index file
+    index_file = open(idset_filename, "a") # Open existing or create
+    index_file.write(post_id + "\n")
+    index_file.close()
+
+
+def downloadMedia(url, filename):
+    # Prepare and execute query to download images
+    opener = urllib.request.build_opener()
+    opener.addheaders = image_request_headers
+    urllib.request.install_opener(opener)
+    source = urllib.request.urlretrieve(asset_image_url, filename)
+
+
+project_fetch_headers = {
+    'authority': 'www.artstation.com',
+    'pragma': 'no-cache',
+    'cache-control': 'no-cache',
+    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"',
+    'upgrade-insecure-requests': '1',
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+    'sec-fetch-site': 'none',
+    'sec-fetch-mode': 'navigate',
+    'sec-fetch-user': '?1',
+    'sec-fetch-dest': 'document',
+    'accept-language': 'de-DE,de;q=0.9',
+    'authority': 'api.reddit.com'
+}
+
+image_request_headers = [
+    ('authority', 'cdna.artstation.com'),
+    ('pragma', 'no-cache'),
+    ('cache-control', 'no-cache'),
+    ('sec-ch-ua', '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"'),
+    ('sec-ch-ua-mobile', '?0'),
+    ('sec-ch-ua-platform', '"Windows"'),
+    ('upgrade-insecure-requests', '1'),
+    ('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'),
+    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
+    ('sec-fetch-site', 'none'),
+    ('sec-fetch-mode', 'navigate'),
+    ('sec-fetch-user', '?1'),
+    ('sec-fetch-dest', 'document'),
+    ('accept-language', 'de-DE,de;q=0.9')
+]
+
+
+artist_name = str.lower(sys.argv[1])
+
+# Create artist directory if it doesn't exist
+artist_directory = "./downloads/" + slugify(artist_name) + "/"
+Path(artist_directory).mkdir(parents=True, exist_ok=True)
+
+# Create directory for already saved posts, and generate filename
+Path("./already_saved/").mkdir(parents=True, exist_ok=True)
+
+# Create directory for logging, and generate filename
+Path("./logs/").mkdir(parents=True, exist_ok=True)
+
+
+# Request project info for artist
+projects_data = requests.get(f"https://www.artstation.com/users/{artist_name}/projects.json", headers=project_fetch_headers)
+projects = projects_data.json()["data"]
+
+# For each project in all of the artists projects
+for project in projects:
+    project_name    = project["title"]
+    project_hash_id = project["hash_id"]
+
+    logMsg(f"Found project {project_name} with id {project_hash_id}. Fetching more info about it...", "okndl")
+
+    # Have we already downloaded this post?
+    if not isPostAlreadySaved(project_hash_id):
+
+        # Fetch information about the project
+        project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
+        assets = project_info.json()["assets"]
+
+        # For each asset in the project (might be multiple images)
+        for asset in assets:
+            asset_type = asset["asset_type"]
+
+            # If the asset is an image
+            if asset_type == "image":
+                asset_image_url = asset["image_url"]
+                asset_position = asset["position"]
+                
+                # Generate a download filename
+                filename = artist_directory + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)
+
+                logMsg(f"Found image-asset for project {project_name} [{project_hash_id}] at position {asset_position}. Downloading to '{filename}'...", "okdl")
+
+                # Download it
+                downloadMedia(asset_image_url, filename)
+            else:
+                logMsg(f"Found non-image-asset for project {project_name} [{project_hash_id}] at position {asset_position}. Skipping...", "okdl")
+
+        # After downloading all assets, mark the project as downloaded.
+        markPostAsSaved(project_hash_id)
+
+    # Project is already downloaded
+    else:
+        logMsg(f"Skipping project {project_name} [{project_hash_id}] because it is already downloaded.", "okndl")
--- a/launch.py
+++ b/launch.py
@@ -0,0 +1,13 @@
+import yaml
+import os
+
+with open("artists.yaml", "r") as yamlfile:
+    try:
+        config = yaml.safe_load(yamlfile)
+
+        for artist in config:
+            print(f"\033[92mGrabbing artist '{artist}'")
+            os.system(f"python3 grab.py '{artist}'")
+
+    except yaml.YAMLError as exc:
+        print("You fucked up the yaml format.")
--- a/required-packages.txt
+++ b/required-packages.txt
@@ -0,0 +1,4 @@
+install these via pip3
+
+requests
+pyyaml