Added functionality to also grab search terms
This commit is contained in:
parent
b37105afa9
commit
8daf6b5245
@ -1,2 +0,0 @@
|
||||
- mixppl
|
||||
- shumolly
|
21
grab-all.py
21
grab-all.py
@ -1,13 +1,26 @@
|
||||
# Dispatcher: runs the individual grabber scripts for everything listed in
# to-grab.yaml — one grab-artist.py call per artist, one grab-search.py call
# per search entry.

import yaml
import os

with open("to-grab.yaml", "r") as yamlfile:
    try:
        config = yaml.safe_load(yamlfile)

        # Grab artists
        if "artists" in config:
            for artist in config["artists"]:
                print(f"\033[92mGrabbing artist '{artist}'")
                # NOTE(review): the artist name is interpolated into a shell
                # command; a name containing a single quote would break it.
                os.system(f"python3 grab-artist.py '{artist}'")

        # Grab search results
        if "searches" in config:
            for search in config["searches"]:
                print(f"\033[92mGrabbing search results for '{search['terms']}'")

                # "max" is optional; an empty string tells grab-search.py to
                # fetch ALL results for this query.
                max_results = ""
                if "max" in search:
                    max_results = search["max"]

                os.system("python3 grab-search.py '" + search['terms'] + "' " + str(max_results))

    except yaml.YAMLError as exc:
        print("You fucked up the yaml format.")
        # The exception carries the line/column of the syntax error —
        # swallowing it (as before) left the user guessing where.
        print(exc)
|
||||
|
@ -1,8 +1,6 @@
|
||||
import requests
|
||||
import mimetypes
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import time
|
||||
@ -13,38 +11,7 @@ from headers import *
|
||||
|
||||
# SYNPOSIS:
|
||||
# To download posts from an artist:
|
||||
# python3 grab.py mixppl
|
||||
|
||||
def isPostAlreadySaved(post_id):
|
||||
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
|
||||
|
||||
# Does the index file even exist yet?
|
||||
if not os.path.exists(idset_filename):
|
||||
return False
|
||||
|
||||
# Open the index file
|
||||
index_file = open(idset_filename, "r") # Open existing or create
|
||||
|
||||
# Store lines in array
|
||||
already_downloaded_post_ids = index_file.readlines()
|
||||
|
||||
return (post_id + "\n") in already_downloaded_post_ids
|
||||
|
||||
def markPostAsSaved(post_id):
|
||||
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
|
||||
|
||||
# Open the index file
|
||||
index_file = open(idset_filename, "a") # Open existing or create
|
||||
index_file.write(post_id + "\n")
|
||||
index_file.close()
|
||||
|
||||
|
||||
def downloadMedia(url, filename):
|
||||
# Prepare and execute query to download images
|
||||
opener = urllib.request.build_opener()
|
||||
opener.addheaders = image_request_headers
|
||||
urllib.request.install_opener(opener)
|
||||
source = urllib.request.urlretrieve(asset_image_url, filename)
|
||||
# python3 grab-artist.py mixppl
|
||||
|
||||
# 2 minute timeout in case something gets stuck.
|
||||
socket.setdefaulttimeout(120)
|
||||
@ -90,7 +57,7 @@ try:
|
||||
logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name)
|
||||
|
||||
# Have we already downloaded this post?
|
||||
if not isPostAlreadySaved(project_hash_id):
|
||||
if not isPostAlreadySaved(project_hash_id, artist_name):
|
||||
|
||||
# Fetch information about the project
|
||||
project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
|
||||
@ -116,7 +83,7 @@ try:
|
||||
logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name)
|
||||
|
||||
# After downloading all assets, mark the project as downloaded.
|
||||
markPostAsSaved(project_hash_id)
|
||||
markPostAsSaved(project_hash_id, artist_name)
|
||||
|
||||
# Project is already downloaded
|
||||
else:
|
||||
@ -124,7 +91,7 @@ try:
|
||||
|
||||
logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name)
|
||||
|
||||
except socket.timeout:
|
||||
except socket.timeout as exc:
|
||||
logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name)
|
||||
except:
|
||||
logMsg("Failed for some reason!", "err", artist_name)
|
||||
except BaseException as exc:
|
||||
logMsg("Failed for some reason!: " + repr(exc), "err", artist_name)
|
121
grab-search.py
Normal file
121
grab-search.py
Normal file
@ -0,0 +1,121 @@
|
||||
import requests
import mimetypes
import sys
from pathlib import Path
from datetime import datetime
import time
import socket

from util import *
from headers import *

# SYNOPSIS:
# To download 100 (or fewer, if there aren't enough) artworks of the search term "game of thrones", call
# python3 grab-search.py "game of thrones" 100
# If max-projects isn't specified, it will fetch them all (beware! i really mean ALL! At this time, this would be over 12000 projects for our game of thrones example).

# 2 minute timeout in case something gets stuck.
socket.setdefaulttimeout(120)

# The slugified, lowercased query doubles as the identity used for the
# download directory, the log file and the already-saved index.
search_terms = str.lower(sys.argv[1])
search_terms_filename = "search_" + slugify(search_terms)

max_projects = sys.maxsize
# Is max-posts specified?
if len(sys.argv) >= 3:
    max_projects = int(sys.argv[2])

# Create artist directory if it doesn't exist
artist_directory = "./downloads/" + search_terms_filename + "/"
Path(artist_directory).mkdir(parents=True, exist_ok=True)

# Create directory for already saved posts, and generate filename
Path("./already_saved/").mkdir(parents=True, exist_ok=True)

# Create directory for logging, and generate filename
Path("./logs/").mkdir(parents=True, exist_ok=True)

if max_projects == sys.maxsize:
    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: ALL OF THEM!", "okndl", search_terms_filename)
else:
    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: {max_projects}", "okndl", search_terms_filename)

# Request project info for artist
lastPageReached = False
pageCounter = 1
projectCounter = 0
try:
    while not lastPageReached:
        logMsg(f"Fetching search result page #{pageCounter} for '{search_terms}'...", "okndl", search_terms_filename)
        projects_data = requests.get(f"https://www.artstation.com/api/v2/search/projects.json?page={pageCounter}&per_page=50&sorting=relevance&query={search_terms.replace(' ', '+')}", headers=project_fetch_headers)
        # Parse the response body once instead of twice.
        response_json = projects_data.json()
        projects = response_json["data"]
        result_size = response_json["total_count"]
        page_num_projects = len(projects)

        lastPageReached = page_num_projects < 50  # Each full page contains 50 projects. If it has less than 50, it is the last page

        if not lastPageReached:
            pageCounter = pageCounter + 1
            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page...", "okndl", search_terms_filename)
        else:
            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page... This is the last page!", "okndl", search_terms_filename)

        # For each project in the current result page
        for project in projects:
            if projectCounter >= max_projects:
                logMsg(f"Reached project download limit of {max_projects}. Stopping...", "okndl", search_terms_filename)
                exit(0)

            project_name = project["title"]
            project_hash_id = project["hash_id"]
            project_artist_name = project["user"]["username"]
            project_artist_name_fullname = project["user"]["full_name"]

            logMsg(f"Found project '{project_name}' of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) with project id {project_hash_id}. Fetching more info about it...", "okndl", search_terms_filename)

            # Have we already downloaded this post?
            if not isPostAlreadySaved(project_hash_id, search_terms_filename):

                # Fetch information about the project
                project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
                assets = project_info.json()["assets"]

                # For each asset in the project (might be multiple images)
                for asset in assets:
                    asset_type = asset["asset_type"]
                    # BUGFIX: the position is logged in BOTH branches below, but
                    # was previously assigned only inside the image branch — a
                    # project whose first asset is not an image raised NameError.
                    asset_position = asset["position"]

                    # If the asset is an image
                    if asset_type == "image":
                        asset_image_url = asset["image_url"]

                        # Generate a download filename
                        filename = artist_directory + slugify(project_artist_name) + "_" + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)

                        logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Downloading to '{filename}'...", "okdl", search_terms_filename)

                        # Download it
                        downloadMedia(asset_image_url, filename)
                    else:
                        logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Skipping...", "okdl", search_terms_filename)

                # After downloading all assets, mark the project as downloaded.
                markPostAsSaved(project_hash_id, search_terms_filename)
                projectCounter = projectCounter + 1

            # Project is already downloaded
            else:
                logMsg(f"Skipping project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) because it is already downloaded.", "okndl", search_terms_filename)

    logMsg(f"Finished all search result pages of '{search_terms}'... Total pages scanned: {pageCounter}", "okndl", search_terms_filename)

except socket.timeout as exc:
    logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", search_terms_filename)
except SystemExit:
    # That's... why i'm here
    exit(0)
except BaseException as exc:
    logMsg("Failed for some reason!: " + repr(exc), "err", search_terms_filename)
|
54
readme.md
54
readme.md
@ -3,6 +3,9 @@ This is a personal and untested tool to keep (and update) local copies of artsta
|
||||
Project id's are saved in order to not re-download everything. This ensures that only new media will be downloaded in each cronjob instance.
|
||||
No fancy bs going on. Just a simple script bodged together within 10 minutes at 3 AM.
|
||||
|
||||
Also supports to download search results.
|
||||
|
||||
|
||||
Currently only working for images. Feel free to issue a pull request if you want more.
|
||||
|
||||
## Setup
|
||||
@ -13,22 +16,57 @@ pip3 install requests pyyaml
|
||||
All scripts require Python3. Tested using 3.9.9.
|
||||
|
||||
## Running it
|
||||
Here we have two scripts:
|
||||
Here we have three scripts:
|
||||
|
||||
### Grab an artists profile
|
||||
```bash
|
||||
grab.py $artist-name
|
||||
grab-artist.py 'mixppl'
|
||||
```
|
||||
This will grab one individual profile.
|
||||
This will grab one individual profile, in this case the user 'mixppl'. You must use the username from the profile's URL, not the artist's full name!
|
||||
|
||||
---
|
||||
### Grab search results
|
||||
```bash
|
||||
grab-search.py 'Game of Thrones' 100
|
||||
```
|
||||
This will grab the first 100 results of the search term 'Game of Thrones'.
|
||||
If you omit the result limit, **ALL** results will be downloaded! That could be useful, if your search query is very niche. But if you omit it for a popular search term, like 'Game of Thrones', you're in for a ride,
|
||||
as all approx 12000 projects will be queued for download.
|
||||
|
||||
### Automate it
|
||||
### Invoke a scan
|
||||
```bash
|
||||
grab-all.py
|
||||
```
|
||||
This will call `grab.py` on all artists listed in `artists.yaml`.
|
||||
This will call `grab.py` on all artists and search terms listed in `to-grab.yaml`.
|
||||
|
||||
Files will be saved to `./downloads/{artist_name}/*.{ext}` and `/downloads/search_{search_terms}/*{artist_id}_*.{ext}`.
|
||||
Logs will be saved to `./logs/{artist_name/search_terms}.txt`.
|
||||
Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name/search_terms}.txt`.
|
||||
|
||||
> :warning: Projects already downloaded from an artist's page will be downloaded **again** if they appear in a search term, and vice versa. Artist- and search queries do NOT share download indices!
|
||||
|
||||
### Configure what to download
|
||||
Simply adjust [`to-grab.yaml`](https://github.com/Leonetienne/Artstation-grabber/blob/master/to-grab.yaml) to your needs. Here is an example:
|
||||
```
|
||||
---
|
||||
artists:
|
||||
- mixppl
|
||||
- shumolly
|
||||
|
||||
searches:
|
||||
-
|
||||
terms: Game of Thrones
|
||||
max: 3
|
||||
|
||||
-
|
||||
terms: Pirates
|
||||
max: 3
|
||||
|
||||
-
|
||||
terms: robby rotton
|
||||
```
|
||||
The last search term, 'robby rotton' shows that you can also omit `max`. If you do not want to fetch artists, or searches, at all, just delete that yaml-array entirely.
|
||||
|
||||
Files will be saved to `./downloads/{artist_name}/*.{ext}`.
|
||||
Logs will be saved to `./logs/{artist_name}.txt`.
|
||||
Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name}.txt`.
|
||||
|
||||
## A word on power usage
|
||||
Do not overuse this or you might piss off Artstation's maintainers. Just because you CAN download 400 gigabytes of images per day doesn't mean that you should!
|
||||
|
16
to-grab.yaml
Normal file
16
to-grab.yaml
Normal file
@ -0,0 +1,16 @@
|
||||
---
# Artist profiles to mirror. Use the username as it appears in the profile
# URL, not the artist's display name.
artists:
  - mixppl
  - shumolly

# Search queries to grab. "max" is optional and caps how many results are
# downloaded; omitting it fetches ALL results for that query.
searches:
  -
    terms: Game of Thrones
    max: 3

  -
    terms: Pirates
    max: 3

  -
    terms: robby rotton
|
34
util.py
34
util.py
@ -3,6 +3,9 @@ from datetime import datetime
|
||||
import time
|
||||
from pathlib import Path
|
||||
import re
|
||||
import os
|
||||
import urllib.request
|
||||
from headers import *
|
||||
|
||||
class bcolors:
|
||||
HEADER = '\033[95m'
|
||||
@ -92,3 +95,34 @@ def slugify(value, allow_unicode=False):
|
||||
|
||||
def getCurrentTimestamp():
|
||||
return datetime.utcfromtimestamp(time.time()).strftime("%m-%d-%Y %H-%M")
|
||||
|
||||
def isPostAlreadySaved(post_id, artist_name):
    """Return True if post_id is already recorded in the download index for artist_name."""
    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"

    # Does the index file even exist yet?
    if not os.path.exists(idset_filename):
        return False

    # Read all recorded ids. Use a context manager so the handle is always
    # closed — the previous version opened the file and never closed it.
    with open(idset_filename, "r") as index_file:
        already_downloaded_post_ids = index_file.readlines()

    # Ids are stored one per line, so compare against the newline-terminated form.
    return (post_id + "\n") in already_downloaded_post_ids
|
||||
|
||||
def markPostAsSaved(post_id, artist_name):
    """Append post_id to the download index for artist_name (file is created on first use)."""
    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"

    # Open in append mode ("a" creates the file if missing); the with-statement
    # guarantees the handle is flushed and closed even if the write raises.
    with open(idset_filename, "a") as index_file:
        index_file.write(post_id + "\n")
|
||||
|
||||
|
||||
def downloadMedia(url, filename):
    """Download url to filename, sending the spoofed image request headers."""
    # Prepare and execute query to download images.
    # NOTE(review): install_opener replaces the process-global urllib opener on
    # every call — fine for this single-purpose script, but a surprising side
    # effect if these helpers are ever reused elsewhere.
    opener = urllib.request.build_opener()
    opener.addheaders = image_request_headers
    urllib.request.install_opener(opener)
    # urlretrieve's return value (filename, headers) was bound but unused before.
    urllib.request.urlretrieve(url, filename)
||||
|
Loading…
x
Reference in New Issue
Block a user