Added functionality to also grab search terms

Leonetienne 2022-01-30 21:02:18 +01:00
parent b37105afa9
commit 8daf6b5245
7 changed files with 240 additions and 53 deletions

artists.yaml (deleted)

@@ -1,2 +0,0 @@
-- mixppl
-- shumolly

grab-all.py

@@ -1,13 +1,26 @@
 import yaml
 import os
 
-with open("artists.yaml", "r") as yamlfile:
+with open("to-grab.yaml", "r") as yamlfile:
     try:
         config = yaml.safe_load(yamlfile)
 
-        for artist in config:
-            print(f"\033[92mGrabbing artist '{artist}'")
-            os.system(f"python3 grab.py '{artist}'")
+        # Grab artists
+        if "artists" in config:
+            for artist in config["artists"]:
+                print(f"\033[92mGrabbing artist '{artist}'")
+                os.system(f"python3 grab-artist.py '{artist}'")
+
+        # Grab search results
+        if "searches" in config:
+            for search in config["searches"]:
+                print(f"\033[92mGrabbing search results for '{search['terms']}'")
+
+                max_results = ""
+                if "max" in search:
+                    max_results = search["max"]
+
+                os.system("python3 grab-search.py '" + search['terms'] + "' " + str(max_results))
 
     except yaml.YAMLError as exc:
         print("You fucked up the yaml format.")

grab-artist.py (renamed from grab.py)

@ -1,8 +1,6 @@
import requests import requests
import mimetypes import mimetypes
import os
import sys import sys
import urllib.request
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
import time import time
@ -13,38 +11,7 @@ from headers import *
# SYNPOSIS: # SYNPOSIS:
# To download posts from an artist: # To download posts from an artist:
# python3 grab.py mixppl # python3 grab-artist.py mixppl
def isPostAlreadySaved(post_id):
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
# Does the index file even exist yet?
if not os.path.exists(idset_filename):
return False
# Open the index file
index_file = open(idset_filename, "r") # Open existing or create
# Store lines in array
already_downloaded_post_ids = index_file.readlines()
return (post_id + "\n") in already_downloaded_post_ids
def markPostAsSaved(post_id):
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
# Open the index file
index_file = open(idset_filename, "a") # Open existing or create
index_file.write(post_id + "\n")
index_file.close()
def downloadMedia(url, filename):
# Prepare and execute query to download images
opener = urllib.request.build_opener()
opener.addheaders = image_request_headers
urllib.request.install_opener(opener)
source = urllib.request.urlretrieve(asset_image_url, filename)
# 2 minute timeout in case something gets stuck. # 2 minute timeout in case something gets stuck.
socket.setdefaulttimeout(120) socket.setdefaulttimeout(120)
@ -90,7 +57,7 @@ try:
logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name) logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name)
# Have we already downloaded this post? # Have we already downloaded this post?
if not isPostAlreadySaved(project_hash_id): if not isPostAlreadySaved(project_hash_id, artist_name):
# Fetch information about the project # Fetch information about the project
project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers) project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
@ -116,7 +83,7 @@ try:
logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name) logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name)
# After downloading all assets, mark the project as downloaded. # After downloading all assets, mark the project as downloaded.
markPostAsSaved(project_hash_id) markPostAsSaved(project_hash_id, artist_name)
# Project is already downloaded # Project is already downloaded
else: else:
@ -124,7 +91,7 @@ try:
logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name) logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name)
except socket.timeout: except socket.timeout as exc:
logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name) logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name)
except: except BaseException as exc:
logMsg("Failed for some reason!", "err", artist_name) logMsg("Failed for some reason!: " + repr(exc), "err", artist_name)

grab-search.py (new file)

@@ -0,0 +1,121 @@
+import requests
+import mimetypes
+import sys
+from pathlib import Path
+from datetime import datetime
+import time
+import socket
+from util import *
+from headers import *
+
+# SYNOPSIS:
+# To download 100 (or fewer, if there aren't enough) artworks of the search term "game of thrones", call
+# python3 grab-search.py "game of thrones" 100
+# If max-projects isn't specified, it will fetch them all (beware! i really mean ALL! At this time, this would be over 12000 projects for our game of thrones example).
+
+# 2 minute timeout in case something gets stuck.
+socket.setdefaulttimeout(120)
+
+search_terms = str.lower(sys.argv[1])
+search_terms_filename = "search_" + slugify(search_terms)
+max_projects = sys.maxsize
+
+# Is max-projects specified?
+if len(sys.argv) >= 3:
+    max_projects = int(sys.argv[2])
+
+# Create artist directory if it doesn't exist
+artist_directory = "./downloads/" + search_terms_filename + "/"
+Path(artist_directory).mkdir(parents=True, exist_ok=True)
+
+# Create directory for already saved posts, and generate filename
+Path("./already_saved/").mkdir(parents=True, exist_ok=True)
+
+# Create directory for logging, and generate filename
+Path("./logs/").mkdir(parents=True, exist_ok=True)
+
+if max_projects == sys.maxsize:
+    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: ALL OF THEM!", "okndl", search_terms_filename)
+else:
+    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: {max_projects}", "okndl", search_terms_filename)
+
+# Request project info for the search
+lastPageReached = False
+pageCounter = 1
+projectCounter = 0
+
+try:
+    while not lastPageReached:
+        logMsg(f"Fetching search result page #{pageCounter} for '{search_terms}'...", "okndl", search_terms_filename)
+        projects_data = requests.get(f"https://www.artstation.com/api/v2/search/projects.json?page={pageCounter}&per_page=50&sorting=relevance&query={search_terms.replace(' ', '+')}", headers=project_fetch_headers)
+        projects = projects_data.json()["data"]
+        result_size = projects_data.json()["total_count"]
+        page_num_projects = len(projects)
+        lastPageReached = page_num_projects < 50 # Each full page contains 50 projects. If it has fewer than 50, it is the last page
+
+        if not lastPageReached:
+            pageCounter = pageCounter + 1
+            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page...", "okndl", search_terms_filename)
+        else:
+            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page... This is the last page!", "okndl", search_terms_filename)
+
+        # For each project in the search results
+        for project in projects:
+            if projectCounter >= max_projects:
+                logMsg(f"Reached project download limit of {max_projects}. Stopping...", "okndl", search_terms_filename)
+                exit(0)
+
+            project_name = project["title"]
+            project_hash_id = project["hash_id"]
+            project_artist_name = project["user"]["username"]
+            project_artist_name_fullname = project["user"]["full_name"]
+
+            logMsg(f"Found project '{project_name}' of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) with project id {project_hash_id}. Fetching more info about it...", "okndl", search_terms_filename)
+
+            # Have we already downloaded this post?
+            if not isPostAlreadySaved(project_hash_id, search_terms_filename):
+
+                # Fetch information about the project
+                project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
+                assets = project_info.json()["assets"]
+
+                # For each asset in the project (might be multiple images)
+                for asset in assets:
+                    asset_type = asset["asset_type"]
+
+                    # If the asset is an image
+                    if asset_type == "image":
+                        asset_image_url = asset["image_url"]
+                        asset_position = asset["position"]
+
+                        # Generate a download filename
+                        filename = artist_directory + slugify(project_artist_name) + "_" + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)
+
+                        logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Downloading to '{filename}'...", "okdl", search_terms_filename)
+
+                        # Download it
+                        downloadMedia(asset_image_url, filename)
+                    else:
+                        logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Skipping...", "okdl", search_terms_filename)
+
+                # After downloading all assets, mark the project as downloaded.
+                markPostAsSaved(project_hash_id, search_terms_filename)
+                projectCounter = projectCounter + 1
+
+            # Project is already downloaded
+            else:
+                logMsg(f"Skipping project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) because it is already downloaded.", "okndl", search_terms_filename)
+
+    logMsg(f"Finished all search result pages of '{search_terms}'... Total pages scanned: {pageCounter}", "okndl", search_terms_filename)
+
+except socket.timeout as exc:
+    logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", search_terms_filename)
+
+except SystemExit:
+    # That's... why i'm here
+    exit(0)
+
+except BaseException as exc:
+    logMsg("Failed for some reason!: " + repr(exc), "err", search_terms_filename)

README.md

@@ -3,6 +3,9 @@ This is a personal and untested tool to keep (and update) local copies of artsta
 Project id's are saved in order to not re-download everything. This ensures that only new media will be downloaded in each cronjob instance.
 
 No fancy bs going on. Just a simple script bodged together within 10 minutes at 3 AM.
 
+Also supports downloading search results.
+
 Currently only working for images. Feel free to issue a pull request if you want more.
 
 ## Setup
@@ -13,22 +16,57 @@ pip3 install requests pyyaml
 All scripts require Python3. Tested using 3.9.9.
 
 ## Running it
-Here we have two scripts:
+Here we have three scripts:
 
+### Grab an artist's profile
 ```bash
-grab.py $artist-name
+grab-artist.py 'mixppl'
 ```
-This will grab one individual profile.
+This will grab one individual profile, in this case the user 'mixppl'. You must use the username from the profile's url, not the full name!
 
----
+### Grab search results
+```bash
+grab-search.py 'Game of Thrones' 100
+```
+This will grab the first 100 results for the search term 'Game of Thrones'.
+If you omit the result limit, **ALL** results will be downloaded! That can be useful if your search query is very niche, but if you omit it for a popular search term like 'Game of Thrones', you're in for a ride, as all approx. 12000 projects will be queued for download.
 
-### Automate it
+### Invoke a scan
 ```bash
 grab-all.py
 ```
-This will call `grab.py` on all artists listed in `artists.yaml`.
+This will call `grab-artist.py` and `grab-search.py` on all artists and search terms listed in `to-grab.yaml`.
 
-Files will be saved to `./downloads/{artist_name}/*.{ext}`.
-Logs will be saved to `./logs/{artist_name}.txt`.
-Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name}.txt`.
+Files will be saved to `./downloads/{artist_name}/*.{ext}` and `./downloads/search_{search_terms}/*{artist_id}_*.{ext}`.
+Logs will be saved to `./logs/{artist_name/search_terms}.txt`.
+Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name/search_terms}.txt`.
+
+> :warning: Projects already downloaded from an artist's page will be downloaded **again** if they appear in a search, and vice versa. Artist and search queries do NOT share download indices!
+
+### Configure what to download
+Simply adjust [`to-grab.yaml`](https://github.com/Leonetienne/Artstation-grabber/blob/master/to-grab.yaml) to your needs. Here is an example:
+```
+---
+artists:
+  - mixppl
+  - shumolly
+
+searches:
+  -
+    terms: Game of Thrones
+    max: 3
+  -
+    terms: Pirates
+    max: 3
+  -
+    terms: robby rotton
+```
+The last search term, 'robby rotton', shows that you can also omit `max`. If you do not want to fetch artists or searches at all, just delete that yaml array entirely.
 
 ## A word on power usage
 Do not overuse this or you might piss off artstation's maintainers. Just because you CAN download 400 gigabytes of images per day doesn't mean you should!
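The duplicate-download warning in the README follows directly from how the index filename is derived: artist runs key it by the artist name, search runs by the `search_`-prefixed slug, so the two files never collide, and hence never deduplicate against each other. A quick illustration, assuming the `slugify` from util.py:

```python
from util import slugify

artist_index = "./already_saved/" + slugify("mixppl") + ".txt"
search_index = "./already_saved/" + slugify("search_" + slugify("game of thrones")) + ".txt"

# The two paths differ, so a project found both ways is recorded (and
# downloaded) twice: once per index.
print(artist_index != search_index)  # True
```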

to-grab.yaml (new file)

@@ -0,0 +1,16 @@
+---
+artists:
+  - mixppl
+  - shumolly
+
+searches:
+  -
+    terms: Game of Thrones
+    max: 3
+  -
+    terms: Pirates
+    max: 3
+  -
+    terms: robby rotton

util.py

@@ -3,6 +3,9 @@ from datetime import datetime
 import time
 from pathlib import Path
 import re
+import os
+import urllib.request
+from headers import *
 
 class bcolors:
     HEADER = '\033[95m'
@@ -92,3 +95,34 @@
 def getCurrentTimestamp():
     return datetime.utcfromtimestamp(time.time()).strftime("%m-%d-%Y %H-%M")
+
+def isPostAlreadySaved(post_id, artist_name):
+    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
+
+    # Does the index file even exist yet?
+    if not os.path.exists(idset_filename):
+        return False
+
+    # Open the index file
+    index_file = open(idset_filename, "r") # Open existing or create
+
+    # Store lines in array
+    already_downloaded_post_ids = index_file.readlines()
+
+    return (post_id + "\n") in already_downloaded_post_ids
+
+def markPostAsSaved(post_id, artist_name):
+    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
+
+    # Open the index file
+    index_file = open(idset_filename, "a") # Open existing or create
+    index_file.write(post_id + "\n")
+    index_file.close()
+
+def downloadMedia(url, filename):
+    # Prepare and execute query to download images
+    opener = urllib.request.build_opener()
+    opener.addheaders = image_request_headers
+    urllib.request.install_opener(opener)
+    source = urllib.request.urlretrieve(url, filename)
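`downloadMedia` installs a process-global urllib opener on every call and relies on the module-level socket timeout. A functionally similar sketch using `requests` (already a dependency of this repo) that streams to disk and keeps headers and timeout local to the call; it assumes `image_request_headers` is the list of `(name, value)` pairs used with `opener.addheaders` above:

```python
import requests
from headers import image_request_headers  # assumption: list of (name, value) pairs

def downloadMediaStreaming(url, filename):
    # Stream the body to disk instead of buffering the whole image in memory.
    response = requests.get(url, headers=dict(image_request_headers), stream=True, timeout=120)
    response.raise_for_status()
    with open(filename, "wb") as f:
        for chunk in response.iter_content(chunk_size=1 << 16):
            f.write(chunk)
```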