Added functionality to also grab search terms
This commit is contained in:
parent
b37105afa9
commit
8daf6b5245
@ -1,2 +0,0 @@
|
||||
- mixppl
|
||||
- shumolly
|
21
grab-all.py
21
grab-all.py
@ -1,13 +1,26 @@
|
||||
# Dispatcher: runs the individual grabber scripts for everything listed in
# to-grab.yaml — one grab-artist.py call per artist, one grab-search.py call
# per search entry.

import yaml
import os

with open("to-grab.yaml", "r") as yamlfile:
    try:
        config = yaml.safe_load(yamlfile)

        # Grab artists
        if "artists" in config:
            for artist in config["artists"]:
                print(f"\033[92mGrabbing artist '{artist}'")
                # NOTE(review): the artist name is interpolated into a shell
                # command; a name containing a single quote would break it.
                os.system(f"python3 grab-artist.py '{artist}'")

        # Grab search results
        if "searches" in config:
            for search in config["searches"]:
                print(f"\033[92mGrabbing search results for '{search['terms']}'")

                # "max" is optional; an empty string tells grab-search.py to
                # fetch ALL results for this query.
                max_results = ""
                if "max" in search:
                    max_results = search["max"]

                os.system("python3 grab-search.py '" + search['terms'] + "' " + str(max_results))

    except yaml.YAMLError as exc:
        print("You fucked up the yaml format.")
        # The exception carries the line/column of the syntax error —
        # swallowing it (as before) left the user guessing where.
        print(exc)
|
||||
|
@ -1,8 +1,6 @@
|
||||
import requests
|
||||
import mimetypes
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import time
|
||||
@ -13,38 +11,7 @@ from headers import *
|
||||
|
||||
# SYNPOSIS:
|
||||
# To download posts from an artist:
|
||||
# python3 grab.py mixppl
|
||||
|
||||
def isPostAlreadySaved(post_id):
|
||||
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
|
||||
|
||||
# Does the index file even exist yet?
|
||||
if not os.path.exists(idset_filename):
|
||||
return False
|
||||
|
||||
# Open the index file
|
||||
index_file = open(idset_filename, "r") # Open existing or create
|
||||
|
||||
# Store lines in array
|
||||
already_downloaded_post_ids = index_file.readlines()
|
||||
|
||||
return (post_id + "\n") in already_downloaded_post_ids
|
||||
|
||||
def markPostAsSaved(post_id):
|
||||
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
|
||||
|
||||
# Open the index file
|
||||
index_file = open(idset_filename, "a") # Open existing or create
|
||||
index_file.write(post_id + "\n")
|
||||
index_file.close()
|
||||
|
||||
|
||||
def downloadMedia(url, filename):
|
||||
# Prepare and execute query to download images
|
||||
opener = urllib.request.build_opener()
|
||||
opener.addheaders = image_request_headers
|
||||
urllib.request.install_opener(opener)
|
||||
source = urllib.request.urlretrieve(asset_image_url, filename)
|
||||
# python3 grab-artist.py mixppl
|
||||
|
||||
# 2 minute timeout in case something gets stuck.
|
||||
socket.setdefaulttimeout(120)
|
||||
@ -90,7 +57,7 @@ try:
|
||||
logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name)
|
||||
|
||||
# Have we already downloaded this post?
|
||||
if not isPostAlreadySaved(project_hash_id):
|
||||
if not isPostAlreadySaved(project_hash_id, artist_name):
|
||||
|
||||
# Fetch information about the project
|
||||
project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
|
||||
@ -116,7 +83,7 @@ try:
|
||||
logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name)
|
||||
|
||||
# After downloading all assets, mark the project as downloaded.
|
||||
markPostAsSaved(project_hash_id)
|
||||
markPostAsSaved(project_hash_id, artist_name)
|
||||
|
||||
# Project is already downloaded
|
||||
else:
|
||||
@ -124,7 +91,7 @@ try:
|
||||
|
||||
logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name)
|
||||
|
||||
except socket.timeout:
|
||||
except socket.timeout as exc:
|
||||
logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name)
|
||||
except:
|
||||
logMsg("Failed for some reason!", "err", artist_name)
|
||||
except BaseException as exc:
|
||||
logMsg("Failed for some reason!: " + repr(exc), "err", artist_name)
|
121
grab-search.py
Normal file
121
grab-search.py
Normal file
@ -0,0 +1,121 @@
|
||||
import requests
import mimetypes
import sys
from pathlib import Path
from datetime import datetime
import time
import socket

from util import *
from headers import *

# SYNOPSIS:
# To download 100 (or fewer, if there aren't enough) artworks of the search term "game of thrones", call
# python3 grab-search.py "game of thrones" 100
# If max-projects isn't specified, it will fetch them all (beware! i really mean ALL! At this time, this would be over 12000 projects for our game of thrones example).

# 2 minute timeout in case something gets stuck.
socket.setdefaulttimeout(120)

# The slugified, lowercased query doubles as the identity used for the
# download directory, the log file and the already-saved index.
search_terms = str.lower(sys.argv[1])
search_terms_filename = "search_" + slugify(search_terms)

max_projects = sys.maxsize
# Is max-posts specified?
if len(sys.argv) >= 3:
    max_projects = int(sys.argv[2])

# Create artist directory if it doesn't exist
artist_directory = "./downloads/" + search_terms_filename + "/"
Path(artist_directory).mkdir(parents=True, exist_ok=True)

# Create directory for already saved posts, and generate filename
Path("./already_saved/").mkdir(parents=True, exist_ok=True)

# Create directory for logging, and generate filename
Path("./logs/").mkdir(parents=True, exist_ok=True)

if max_projects == sys.maxsize:
    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: ALL OF THEM!", "okndl", search_terms_filename)
else:
    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: {max_projects}", "okndl", search_terms_filename)

# Request project info for artist
lastPageReached = False
pageCounter = 1
projectCounter = 0
try:
    while not lastPageReached:
        logMsg(f"Fetching search result page #{pageCounter} for '{search_terms}'...", "okndl", search_terms_filename)
        projects_data = requests.get(f"https://www.artstation.com/api/v2/search/projects.json?page={pageCounter}&per_page=50&sorting=relevance&query={search_terms.replace(' ', '+')}", headers=project_fetch_headers)
        # Parse the response body once instead of twice.
        response_json = projects_data.json()
        projects = response_json["data"]
        result_size = response_json["total_count"]
        page_num_projects = len(projects)

        lastPageReached = page_num_projects < 50  # Each full page contains 50 projects. If it has less than 50, it is the last page

        if not lastPageReached:
            pageCounter = pageCounter + 1
            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page...", "okndl", search_terms_filename)
        else:
            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page... This is the last page!", "okndl", search_terms_filename)

        # For each project in the current result page
        for project in projects:
            if projectCounter >= max_projects:
                logMsg(f"Reached project download limit of {max_projects}. Stopping...", "okndl", search_terms_filename)
                exit(0)

            project_name = project["title"]
            project_hash_id = project["hash_id"]
            project_artist_name = project["user"]["username"]
            project_artist_name_fullname = project["user"]["full_name"]

            logMsg(f"Found project '{project_name}' of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) with project id {project_hash_id}. Fetching more info about it...", "okndl", search_terms_filename)

            # Have we already downloaded this post?
            if not isPostAlreadySaved(project_hash_id, search_terms_filename):

                # Fetch information about the project
                project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
                assets = project_info.json()["assets"]

                # For each asset in the project (might be multiple images)
                for asset in assets:
                    asset_type = asset["asset_type"]
                    # BUGFIX: the position is logged in BOTH branches below, but
                    # was previously assigned only inside the image branch — a
                    # project whose first asset is not an image raised NameError.
                    asset_position = asset["position"]

                    # If the asset is an image
                    if asset_type == "image":
                        asset_image_url = asset["image_url"]

                        # Generate a download filename
                        filename = artist_directory + slugify(project_artist_name) + "_" + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)

                        logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Downloading to '{filename}'...", "okdl", search_terms_filename)

                        # Download it
                        downloadMedia(asset_image_url, filename)
                    else:
                        logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Skipping...", "okdl", search_terms_filename)

                # After downloading all assets, mark the project as downloaded.
                markPostAsSaved(project_hash_id, search_terms_filename)
                projectCounter = projectCounter + 1

            # Project is already downloaded
            else:
                logMsg(f"Skipping project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) because it is already downloaded.", "okndl", search_terms_filename)

    logMsg(f"Finished all search result pages of '{search_terms}'... Total pages scanned: {pageCounter}", "okndl", search_terms_filename)

except socket.timeout as exc:
    logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", search_terms_filename)
except SystemExit:
    # That's... why i'm here
    exit(0)
except BaseException as exc:
    logMsg("Failed for some reason!: " + repr(exc), "err", search_terms_filename)
|
54
readme.md
54
readme.md
@ -3,6 +3,9 @@ This is a personal and untested tool to keep (and update) local copies of artsta
|
||||
Project id's are saved in order to not re-download everything. This ensures that only new media will be downloaded in each cronjob instance.
|
||||
No fancy bs going on. Just a simple script bodged together within 10 minutes at 3 AM.
|
||||
|
||||
Also supports to download search results.
|
||||
|
||||
|
||||
Currently only working for images. Feel free to issue a pull request if you want more.
|
||||
|
||||
## Setup
|
||||
@ -13,22 +16,57 @@ pip3 install requests pyyaml
|
||||
All scripts require Python3. Tested using 3.9.9.
|
||||
|
||||
## Running it
|
||||
Here we have two scripts:
|
||||
Here we have three scripts:
|
||||
|
||||
### Grab an artists profile
|
||||
```bash
|
||||
grab.py $artist-name
|
||||
grab-artist.py 'mixppl'
|
||||
```
|
||||
This will grab one individual profile.
|
||||
This will grab one individual profile, in this case the user 'mixppl'. You must use the username from the profile's URL, not the artist's full name!
|
||||
|
||||
---
|
||||
### Grab search results
|
||||
```bash
|
||||
grab-search.py 'Game of Thrones' 100
|
||||
```
|
||||
This will grab the first 100 results of the search term 'Game of Thrones'.
|
||||
If you omit the result limit, **ALL** results will be downloaded! That could be useful, if your search query is very niche. But if you omit it for a popular search term, like 'Game of Thrones', you're in for a ride,
|
||||
as all approx 12000 projects will be queued for download.
|
||||
|
||||
### Automate it
|
||||
### Invoke a scan
|
||||
```bash
|
||||
grab-all.py
|
||||
```
|
||||
This will call `grab.py` on all artists listed in `artists.yaml`.
|
||||
This will call `grab.py` on all artists and search terms listed in `to-grab.yaml`.
|
||||
|
||||
Files will be saved to `./downloads/{artist_name}/*.{ext}` and `/downloads/search_{search_terms}/*{artist_id}_*.{ext}`.
|
||||
Logs will be saved to `./logs/{artist_name/search_terms}.txt`.
|
||||
Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name/search_terms}.txt`.
|
||||
|
||||
> :warning: Projects already downloaded from an artist's page will be downloaded **again** if they appear in a search term, and vice versa. Artist- and search queries do NOT share download indices!
|
||||
|
||||
### Configure what to download
|
||||
Simply adjust [`to-grab.yaml`](https://github.com/Leonetienne/Artstation-grabber/blob/master/to-grab.yaml) to your needs. Here is an example:
|
||||
```
|
||||
---
|
||||
artists:
|
||||
- mixppl
|
||||
- shumolly
|
||||
|
||||
searches:
|
||||
-
|
||||
terms: Game of Thrones
|
||||
max: 3
|
||||
|
||||
-
|
||||
terms: Pirates
|
||||
max: 3
|
||||
|
||||
-
|
||||
terms: robby rotton
|
||||
```
|
||||
The last search term, 'robby rotton' shows that you can also omit `max`. If you do not want to fetch artists, or searches, at all, just delete that yaml-array entirely.
|
||||
|
||||
Files will be saved to `./downloads/{artist_name}/*.{ext}`.
|
||||
Logs will be saved to `./logs/{artist_name}.txt`.
|
||||
Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name}.txt`.
|
||||
|
||||
## A word on power usage
|
||||
Do not overuse this or you might piss off Artstation's maintainers. Just because you CAN download 400 gigabytes of images per day doesn't mean that you should!
|
||||
|
16
to-grab.yaml
Normal file
16
to-grab.yaml
Normal file
@ -0,0 +1,16 @@
|
||||
---
# Artist profiles to mirror. Use the username as it appears in the profile
# URL, not the artist's display name.
artists:
  - mixppl
  - shumolly

# Search queries to grab. "max" is optional and caps how many results are
# downloaded; omitting it fetches ALL results for that query.
searches:
  -
    terms: Game of Thrones
    max: 3

  -
    terms: Pirates
    max: 3

  -
    terms: robby rotton
|
34
util.py
34
util.py
@ -3,6 +3,9 @@ from datetime import datetime
|
||||
import time
|
||||
from pathlib import Path
|
||||
import re
|
||||
import os
|
||||
import urllib.request
|
||||
from headers import *
|
||||
|
||||
class bcolors:
|
||||
HEADER = '\033[95m'
|
||||
@ -92,3 +95,34 @@ def slugify(value, allow_unicode=False):
|
||||
|
||||
def getCurrentTimestamp():
|
||||
return datetime.utcfromtimestamp(time.time()).strftime("%m-%d-%Y %H-%M")
|
||||
|
||||
def isPostAlreadySaved(post_id, artist_name):
    """Return True if post_id is already recorded in the download index for artist_name."""
    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"

    # Does the index file even exist yet?
    if not os.path.exists(idset_filename):
        return False

    # Read all recorded ids. Use a context manager so the handle is always
    # closed — the previous version opened the file and never closed it.
    with open(idset_filename, "r") as index_file:
        already_downloaded_post_ids = index_file.readlines()

    # Ids are stored one per line, so compare against the newline-terminated form.
    return (post_id + "\n") in already_downloaded_post_ids
|
||||
|
||||
def markPostAsSaved(post_id, artist_name):
    """Append post_id to the download index for artist_name (file is created on first use)."""
    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"

    # Open in append mode ("a" creates the file if missing); the with-statement
    # guarantees the handle is flushed and closed even if the write raises.
    with open(idset_filename, "a") as index_file:
        index_file.write(post_id + "\n")
|
||||
|
||||
|
||||
def downloadMedia(url, filename):
    """Download url to filename, sending the spoofed image request headers."""
    # Prepare and execute query to download images.
    # NOTE(review): install_opener replaces the process-global urllib opener on
    # every call — fine for this single-purpose script, but a surprising side
    # effect if these helpers are ever reused elsewhere.
    opener = urllib.request.build_opener()
    opener.addheaders = image_request_headers
    urllib.request.install_opener(opener)
    # urlretrieve's return value (filename, headers) was bound but unused before.
    urllib.request.urlretrieve(url, filename)
||||
|
Loading…
x
Reference in New Issue
Block a user