Added functionality to also grab search terms

Leonetienne 2022-01-30 21:02:18 +01:00
parent b37105afa9
commit 8daf6b5245
7 changed files with 240 additions and 53 deletions

artists.yaml (deleted)

@@ -1,2 +0,0 @@
-- mixppl
-- shumolly

grab-all.py

@@ -1,13 +1,26 @@
 import yaml
 import os
 
-with open("artists.yaml", "r") as yamlfile:
+with open("to-grab.yaml", "r") as yamlfile:
     try:
         config = yaml.safe_load(yamlfile)
 
-        for artist in config:
-            print(f"\033[92mGrabbing artist '{artist}'")
-            os.system(f"python3 grab.py '{artist}'")
+        # Grab artists
+        if "artists" in config:
+            for artist in config["artists"]:
+                print(f"\033[92mGrabbing artist '{artist}'")
+                os.system(f"python3 grab-artist.py '{artist}'")
+
+        # Grab search results
+        if "searches" in config:
+            for search in config["searches"]:
+                print(f"\033[92mGrabbing search results for '{search['terms']}'")
+
+                max_results = ""
+                if "max" in search:
+                    max_results = search["max"]
+
+                os.system("python3 grab-search.py '" + search['terms'] + "' " + str(max_results))
 
     except yaml.YAMLError as exc:
         print("You fucked up the yaml format.")

grab-artist.py (renamed from grab.py)

@ -1,8 +1,6 @@
import requests import requests
import mimetypes import mimetypes
import os
import sys import sys
import urllib.request
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
import time import time
@ -13,38 +11,7 @@ from headers import *
# SYNPOSIS: # SYNPOSIS:
# To download posts from an artist: # To download posts from an artist:
# python3 grab.py mixppl # python3 grab-artist.py mixppl
def isPostAlreadySaved(post_id):
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
# Does the index file even exist yet?
if not os.path.exists(idset_filename):
return False
# Open the index file
index_file = open(idset_filename, "r") # Open existing or create
# Store lines in array
already_downloaded_post_ids = index_file.readlines()
return (post_id + "\n") in already_downloaded_post_ids
def markPostAsSaved(post_id):
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
# Open the index file
index_file = open(idset_filename, "a") # Open existing or create
index_file.write(post_id + "\n")
index_file.close()
def downloadMedia(url, filename):
# Prepare and execute query to download images
opener = urllib.request.build_opener()
opener.addheaders = image_request_headers
urllib.request.install_opener(opener)
source = urllib.request.urlretrieve(asset_image_url, filename)
# 2 minute timeout in case something gets stuck. # 2 minute timeout in case something gets stuck.
socket.setdefaulttimeout(120) socket.setdefaulttimeout(120)
@ -90,7 +57,7 @@ try:
logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name) logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name)
# Have we already downloaded this post? # Have we already downloaded this post?
if not isPostAlreadySaved(project_hash_id): if not isPostAlreadySaved(project_hash_id, artist_name):
# Fetch information about the project # Fetch information about the project
project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers) project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
@ -116,7 +83,7 @@ try:
logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name) logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name)
# After downloading all assets, mark the project as downloaded. # After downloading all assets, mark the project as downloaded.
markPostAsSaved(project_hash_id) markPostAsSaved(project_hash_id, artist_name)
# Project is already downloaded # Project is already downloaded
else: else:
@ -124,7 +91,7 @@ try:
logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name) logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name)
except socket.timeout: except socket.timeout as exc:
logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name) logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name)
except: except BaseException as exc:
logMsg("Failed for some reason!", "err", artist_name) logMsg("Failed for some reason!: " + repr(exc), "err", artist_name)

grab-search.py (new file)

@@ -0,0 +1,121 @@
+import requests
+import mimetypes
+import sys
+from pathlib import Path
+from datetime import datetime
+import time
+import socket
+from util import *
+from headers import *
+
+# SYNOPSIS:
+# To download 100 (or fewer, if there aren't enough) artworks of the search term "game of thrones", call
+# python3 grab-search.py "game of thrones" 100
+# If max-projects isn't specified, it will fetch them all (beware! i really mean ALL! At this time, this would be over 12000 projects for our game of thrones example).
+
+# 2 minute timeout in case something gets stuck.
+socket.setdefaulttimeout(120)
+
+search_terms = str.lower(sys.argv[1])
+search_terms_filename = "search_" + slugify(search_terms)
+max_projects = sys.maxsize
+
+# Is max-projects specified?
+if len(sys.argv) >= 3:
+    max_projects = int(sys.argv[2])
+
+# Create artist directory if it doesn't exist
+artist_directory = "./downloads/" + search_terms_filename + "/"
+Path(artist_directory).mkdir(parents=True, exist_ok=True)
+
+# Create directory for already saved posts, and generate filename
+Path("./already_saved/").mkdir(parents=True, exist_ok=True)
+
+# Create directory for logging, and generate filename
+Path("./logs/").mkdir(parents=True, exist_ok=True)
+
+if max_projects == sys.maxsize:
+    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: ALL OF THEM!", "okndl", search_terms_filename)
+else:
+    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: {max_projects}", "okndl", search_terms_filename)
+
+# Request project info for the search
+lastPageReached = False
+pageCounter = 1
+projectCounter = 0
+
+try:
+    while not lastPageReached:
+        logMsg(f"Fetching search result page #{pageCounter} for '{search_terms}'...", "okndl", search_terms_filename)
+        projects_data = requests.get(f"https://www.artstation.com/api/v2/search/projects.json?page={pageCounter}&per_page=50&sorting=relevance&query={search_terms.replace(' ', '+')}", headers=project_fetch_headers)
+        projects = projects_data.json()["data"]
+        result_size = projects_data.json()["total_count"]
+        page_num_projects = len(projects)
+        lastPageReached = page_num_projects < 50 # Each full page contains 50 projects. If it has fewer than 50, it is the last page
+
+        if not lastPageReached:
+            pageCounter = pageCounter + 1
+            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page...", "okndl", search_terms_filename)
+        else:
+            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page... This is the last page!", "okndl", search_terms_filename)
+
+        # For each project in the search results
+        for project in projects:
+            if projectCounter >= max_projects:
+                logMsg(f"Reached project download limit of {max_projects}. Stopping...", "okndl", search_terms_filename)
+                exit(0)
+
+            project_name = project["title"]
+            project_hash_id = project["hash_id"]
+            project_artist_name = project["user"]["username"]
+            project_artist_name_fullname = project["user"]["full_name"]
+
+            logMsg(f"Found project '{project_name}' of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) with project id {project_hash_id}. Fetching more info about it...", "okndl", search_terms_filename)
+
+            # Have we already downloaded this post?
+            if not isPostAlreadySaved(project_hash_id, search_terms_filename):
+
+                # Fetch information about the project
+                project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
+                assets = project_info.json()["assets"]
+
+                # For each asset in the project (might be multiple images)
+                for asset in assets:
+                    asset_type = asset["asset_type"]
+
+                    # If the asset is an image
+                    if asset_type == "image":
+                        asset_image_url = asset["image_url"]
+                        asset_position = asset["position"]
+
+                        # Generate a download filename
+                        filename = artist_directory + slugify(project_artist_name) + "_" + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)
+
+                        logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Downloading to '{filename}'...", "okdl", search_terms_filename)
+
+                        # Download it
+                        downloadMedia(asset_image_url, filename)
+                    else:
+                        logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Skipping...", "okdl", search_terms_filename)
+
+                # After downloading all assets, mark the project as downloaded.
+                markPostAsSaved(project_hash_id, search_terms_filename)
+                projectCounter = projectCounter + 1
+
+            # Project is already downloaded
+            else:
+                logMsg(f"Skipping project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) because it is already downloaded.", "okndl", search_terms_filename)
+
+    logMsg(f"Finished all search result pages of '{search_terms}'... Total pages scanned: {pageCounter}", "okndl", search_terms_filename)
+
+except socket.timeout as exc:
+    logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", search_terms_filename)
+
+except SystemExit:
+    # That's... why i'm here
+    exit(0)
+
+except BaseException as exc:
+    logMsg("Failed for some reason!: " + repr(exc), "err", search_terms_filename)

README.md

@@ -3,6 +3,9 @@ This is a personal and untested tool to keep (and update) local copies of artsta
 Project id's are saved in order to not re-download everything. This ensures that only new media will be downloaded in each cronjob instance.
 
 No fancy bs going on. Just a simple script bodged together within 10 minutes at 3 AM.
 
+Also supports downloading search results.
+
 Currently only working for images. Feel free to issue a pull request if you want more.
 
 ## Setup
@@ -13,22 +16,57 @@ pip3 install requests pyyaml
 All scripts require Python3. Tested using 3.9.9.
 
 ## Running it
-Here we have two scripts:
+Here we have three scripts:
 
+### Grab an artist's profile
 ```bash
-grab.py $artist-name
+grab-artist.py 'mixppl'
 ```
-This will grab one individual profile.
+This will grab one individual profile, in this case the user 'mixppl'. You must use the username from the profile's url, not the full name!
 
----
+### Grab search results
+```bash
+grab-search.py 'Game of Thrones' 100
+```
+This will grab the first 100 results for the search term 'Game of Thrones'.
+If you omit the result limit, **ALL** results will be downloaded! That can be useful if your search query is very niche, but if you omit it for a popular search term like 'Game of Thrones', you're in for a ride, as all approx. 12000 projects will be queued for download.
 
-### Automate it
+### Invoke a scan
 ```bash
 grab-all.py
 ```
-This will call `grab.py` on all artists listed in `artists.yaml`.
+This will call `grab-artist.py` and `grab-search.py` on all artists and search terms listed in `to-grab.yaml`.
 
-Files will be saved to `./downloads/{artist_name}/*.{ext}`.
-Logs will be saved to `./logs/{artist_name}.txt`.
-Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name}.txt`.
+Files will be saved to `./downloads/{artist_name}/*.{ext}` and `./downloads/search_{search_terms}/*{artist_id}_*.{ext}`.
+Logs will be saved to `./logs/{artist_name/search_terms}.txt`.
+Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name/search_terms}.txt`.
+
+> :warning: Projects already downloaded from an artist's page will be downloaded **again** if they appear in a search, and vice versa. Artist and search queries do NOT share download indices!
+
+### Configure what to download
+Simply adjust [`to-grab.yaml`](https://github.com/Leonetienne/Artstation-grabber/blob/master/to-grab.yaml) to your needs. Here is an example:
+```
+---
+artists:
+  - mixppl
+  - shumolly
+
+searches:
+  -
+    terms: Game of Thrones
+    max: 3
+  -
+    terms: Pirates
+    max: 3
+  -
+    terms: robby rotton
+```
+The last search term, 'robby rotton', shows that you can also omit `max`. If you do not want to fetch artists or searches at all, just delete that yaml array entirely.
 
 ## A word on power usage
 Do not overuse this or you might piss off artstation's maintainers. Just because you CAN download 400 gigabytes of images per day doesn't mean you should!
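The duplicate-download warning in the README follows directly from how the index filename is derived: artist runs key it by the artist name, search runs by the `search_`-prefixed slug, so the two files never collide, and hence never deduplicate against each other. A quick illustration, assuming the `slugify` from util.py:

```python
from util import slugify

artist_index = "./already_saved/" + slugify("mixppl") + ".txt"
search_index = "./already_saved/" + slugify("search_" + slugify("game of thrones")) + ".txt"

# The two paths differ, so a project found both ways is recorded (and
# downloaded) twice: once per index.
print(artist_index != search_index)  # True
```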

to-grab.yaml (new file)

@@ -0,0 +1,16 @@
+---
+artists:
+  - mixppl
+  - shumolly
+
+searches:
+  -
+    terms: Game of Thrones
+    max: 3
+  -
+    terms: Pirates
+    max: 3
+  -
+    terms: robby rotton

util.py

@@ -3,6 +3,9 @@ from datetime import datetime
 import time
 from pathlib import Path
 import re
+import os
+import urllib.request
+from headers import *
 
 class bcolors:
     HEADER = '\033[95m'
@@ -92,3 +95,34 @@
 def getCurrentTimestamp():
     return datetime.utcfromtimestamp(time.time()).strftime("%m-%d-%Y %H-%M")
+
+def isPostAlreadySaved(post_id, artist_name):
+    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
+
+    # Does the index file even exist yet?
+    if not os.path.exists(idset_filename):
+        return False
+
+    # Open the index file
+    index_file = open(idset_filename, "r") # Open existing or create
+
+    # Store lines in array
+    already_downloaded_post_ids = index_file.readlines()
+
+    return (post_id + "\n") in already_downloaded_post_ids
+
+def markPostAsSaved(post_id, artist_name):
+    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
+
+    # Open the index file
+    index_file = open(idset_filename, "a") # Open existing or create
+    index_file.write(post_id + "\n")
+    index_file.close()
+
+def downloadMedia(url, filename):
+    # Prepare and execute query to download images
+    opener = urllib.request.build_opener()
+    opener.addheaders = image_request_headers
+    urllib.request.install_opener(opener)
+    source = urllib.request.urlretrieve(url, filename)
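`downloadMedia` installs a process-global urllib opener on every call and relies on the module-level socket timeout. A functionally similar sketch using `requests` (already a dependency of this repo) that streams to disk and keeps headers and timeout local to the call; it assumes `image_request_headers` is the list of `(name, value)` pairs used with `opener.addheaders` above:

```python
import requests
from headers import image_request_headers  # assumption: list of (name, value) pairs

def downloadMediaStreaming(url, filename):
    # Stream the body to disk instead of buffering the whole image in memory.
    response = requests.get(url, headers=dict(image_request_headers), stream=True, timeout=120)
    response.raise_for_status()
    with open(filename, "wb") as f:
        for chunk in response.iter_content(chunk_size=1 << 16):
            f.write(chunk)
```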