Added functionality to also grab search terms
This commit is contained in:
parent
b37105afa9
commit
8daf6b5245
@ -1,2 +0,0 @@
|
|||||||
- mixppl
|
|
||||||
- shumolly
|
|
21
grab-all.py
21
grab-all.py
@ -1,13 +1,26 @@
|
|||||||
import yaml
|
import yaml
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# grab-all.py: run grab-artist.py for every artist and grab-search.py for
# every search listed in to-grab.yaml.
import subprocess
import sys

# Read the configuration up front so the file is closed before the
# (potentially very long) downloads start.
with open("to-grab.yaml", "r") as yamlfile:
    try:
        # An empty file parses to None; treat that as "nothing to grab".
        config = yaml.safe_load(yamlfile) or {}
    except yaml.YAMLError as exc:
        print("You fucked up the yaml format.")
        print(exc)
        config = {}

# Grab artists
for artist in config.get("artists") or []:
    print(f"\033[92mGrabbing artist '{artist}'", flush=True)
    # Pass arguments as a list (no shell): names containing quotes, spaces
    # or other shell metacharacters reach the child script unmodified.
    subprocess.run([sys.executable, "grab-artist.py", str(artist)])

# Grab search results
for search in config.get("searches") or []:
    terms = str(search["terms"])
    print(f"\033[92mGrabbing search results for '{terms}'", flush=True)

    command = [sys.executable, "grab-search.py", terms]

    # "max" is optional; when omitted, grab-search.py fetches every result.
    if search.get("max") is not None:
        command.append(str(search["max"]))

    subprocess.run(command)
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
import requests
|
import requests
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
|
||||||
import sys
|
import sys
|
||||||
import urllib.request
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import time
|
import time
|
||||||
@ -13,38 +11,7 @@ from headers import *
|
|||||||
|
|
||||||
# SYNOPSIS:
|
# SYNOPSIS:
|
||||||
# To download posts from an artist:
|
# To download posts from an artist:
|
||||||
# python3 grab.py mixppl
|
# python3 grab-artist.py mixppl
|
||||||
|
|
||||||
def isPostAlreadySaved(post_id):
|
|
||||||
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
|
|
||||||
|
|
||||||
# Does the index file even exist yet?
|
|
||||||
if not os.path.exists(idset_filename):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Open the index file
|
|
||||||
index_file = open(idset_filename, "r") # Open existing or create
|
|
||||||
|
|
||||||
# Store lines in array
|
|
||||||
already_downloaded_post_ids = index_file.readlines()
|
|
||||||
|
|
||||||
return (post_id + "\n") in already_downloaded_post_ids
|
|
||||||
|
|
||||||
def markPostAsSaved(post_id):
|
|
||||||
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
|
|
||||||
|
|
||||||
# Open the index file
|
|
||||||
index_file = open(idset_filename, "a") # Open existing or create
|
|
||||||
index_file.write(post_id + "\n")
|
|
||||||
index_file.close()
|
|
||||||
|
|
||||||
|
|
||||||
def downloadMedia(url, filename):
|
|
||||||
# Prepare and execute query to download images
|
|
||||||
opener = urllib.request.build_opener()
|
|
||||||
opener.addheaders = image_request_headers
|
|
||||||
urllib.request.install_opener(opener)
|
|
||||||
source = urllib.request.urlretrieve(asset_image_url, filename)
|
|
||||||
|
|
||||||
# 2 minute timeout in case something gets stuck.
|
# 2 minute timeout in case something gets stuck.
|
||||||
socket.setdefaulttimeout(120)
|
socket.setdefaulttimeout(120)
|
||||||
@ -90,7 +57,7 @@ try:
|
|||||||
logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name)
|
logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name)
|
||||||
|
|
||||||
# Have we already downloaded this post?
|
# Have we already downloaded this post?
|
||||||
if not isPostAlreadySaved(project_hash_id):
|
if not isPostAlreadySaved(project_hash_id, artist_name):
|
||||||
|
|
||||||
# Fetch information about the project
|
# Fetch information about the project
|
||||||
project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
|
project_info = requests.get(f"https://www.artstation.com/projects/{project_hash_id}.json", headers=project_fetch_headers)
|
||||||
@ -116,7 +83,7 @@ try:
|
|||||||
logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name)
|
logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name)
|
||||||
|
|
||||||
# After downloading all assets, mark the project as downloaded.
|
# After downloading all assets, mark the project as downloaded.
|
||||||
markPostAsSaved(project_hash_id)
|
markPostAsSaved(project_hash_id, artist_name)
|
||||||
|
|
||||||
# Project is already downloaded
|
# Project is already downloaded
|
||||||
else:
|
else:
|
||||||
@ -124,7 +91,7 @@ try:
|
|||||||
|
|
||||||
logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name)
|
logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name)
|
||||||
|
|
||||||
except socket.timeout:
|
except socket.timeout as exc:
|
||||||
logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name)
|
logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name)
|
||||||
except:
|
except BaseException as exc:
|
||||||
logMsg("Failed for some reason!", "err", artist_name)
|
logMsg("Failed for some reason!: " + repr(exc), "err", artist_name)
|
121
grab-search.py
Normal file
121
grab-search.py
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
import requests
|
||||||
|
import mimetypes
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import time
|
||||||
|
import socket
|
||||||
|
|
||||||
|
from util import *
|
||||||
|
from headers import *
|
||||||
|
|
||||||
|
# SYNOPSIS:
# To download 100 (or fewer, if there aren't enough) artworks of the search term "game of thrones", call
# python3 grab-search.py "game of thrones" 100
# If max-projects isn't specified, it will fetch them all (beware! i really mean ALL! At this time, this would be over 12000 projects for our game of thrones example).

# 2 minute timeout in case something gets stuck.
# The socket default is relied on for the media download (downloadMedia);
# requests documents that its calls never time out unless a timeout is
# passed, so the same limit is handed to every requests.get() below.
REQUEST_TIMEOUT_SECONDS = 120
socket.setdefaulttimeout(REQUEST_TIMEOUT_SECONDS)

# Each full result page contains this many projects; a shorter page is the last one.
PROJECTS_PER_PAGE = 50

if len(sys.argv) < 2:
    print("Usage: python3 grab-search.py <search terms> [max-projects]")
    sys.exit(1)

search_terms = sys.argv[1].lower()
search_terms_filename = "search_" + slugify(search_terms)

# Is max-projects specified? If not, there is effectively no limit.
max_projects = sys.maxsize
if len(sys.argv) >= 3:
    max_projects = int(sys.argv[2])

# Create the download directory for this search if it doesn't exist
artist_directory = "./downloads/" + search_terms_filename + "/"
Path(artist_directory).mkdir(parents=True, exist_ok=True)

# Create directory for the index of already saved posts
Path("./already_saved/").mkdir(parents=True, exist_ok=True)

# Create directory for logging
Path("./logs/").mkdir(parents=True, exist_ok=True)

if max_projects == sys.maxsize:
    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: ALL OF THEM!", "okndl", search_terms_filename)
else:
    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: {max_projects}", "okndl", search_terms_filename)

# Walk the search result pages until a short (= last) page is seen
# or the project limit is reached.
lastPageReached = False
pageCounter = 1
projectCounter = 0
try:
    while not lastPageReached:
        logMsg(f"Fetching search result page #{pageCounter} for '{search_terms}'...", "okndl", search_terms_filename)

        # Let requests build the query string so that characters such as
        # '&', '#' or '+' in the search terms are escaped (spaces become '+').
        projects_data = requests.get(
            "https://www.artstation.com/api/v2/search/projects.json",
            params={
                "page": pageCounter,
                "per_page": PROJECTS_PER_PAGE,
                "sorting": "relevance",
                "query": search_terms,
            },
            headers=project_fetch_headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

        # Parse the response body once and reuse it.
        search_result = projects_data.json()
        projects = search_result["data"]
        result_size = search_result["total_count"]
        page_num_projects = len(projects)

        lastPageReached = page_num_projects < PROJECTS_PER_PAGE

        if not lastPageReached:
            pageCounter = pageCounter + 1
            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page...", "okndl", search_terms_filename)
        else:
            logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page... This is the last page!", "okndl", search_terms_filename)

        # For each project on this result page
        for project in projects:
            if projectCounter >= max_projects:
                logMsg(f"Reached project download limit of {max_projects}. Stopping...", "okndl", search_terms_filename)
                # SystemExit is not an Exception subclass, so it passes
                # straight through the handlers below.
                sys.exit(0)

            project_name = project["title"]
            project_hash_id = project["hash_id"]
            project_artist_name = project["user"]["username"]
            project_artist_name_fullname = project["user"]["full_name"]

            logMsg(f"Found project '{project_name}' of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) with project id {project_hash_id}. Fetching more info about it...", "okndl", search_terms_filename)

            # Project is already downloaded
            if isPostAlreadySaved(project_hash_id, search_terms_filename):
                logMsg(f"Skipping project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) because it is already downloaded.", "okndl", search_terms_filename)
                continue

            # Fetch information about the project
            project_info = requests.get(
                f"https://www.artstation.com/projects/{project_hash_id}.json",
                headers=project_fetch_headers,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            assets = project_info.json()["assets"]

            # For each asset in the project (might be multiple images)
            for asset in assets:
                asset_type = asset["asset_type"]

                # Read the position before branching: both branches log it,
                # and the non-image branch used to hit an unassigned (or
                # stale) variable when a non-image asset came first.
                asset_position = asset.get("position")

                # If the asset is an image
                if asset_type == "image":
                    asset_image_url = asset["image_url"]

                    # Generate a download filename
                    filename = artist_directory + slugify(project_artist_name) + "_" + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)

                    logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Downloading to '{filename}'...", "okdl", search_terms_filename)

                    # Download it
                    downloadMedia(asset_image_url, filename)
                else:
                    logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Skipping...", "okdl", search_terms_filename)

            # After downloading all assets, mark the project as downloaded.
            markPostAsSaved(project_hash_id, search_terms_filename)
            projectCounter = projectCounter + 1

    logMsg(f"Finished all search result pages of '{search_terms}'... Total pages scanned: {pageCounter}", "okndl", search_terms_filename)

except (socket.timeout, requests.exceptions.Timeout):
    logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", search_terms_filename)
except Exception as exc:
    # Catch Exception rather than BaseException so that SystemExit and
    # Ctrl-C (KeyboardInterrupt) still terminate the script.
    logMsg("Failed for some reason!: " + repr(exc), "err", search_terms_filename)
|
54
readme.md
54
readme.md
@ -3,6 +3,9 @@ This is a personal and untested tool to keep (and update) local copies of artsta
|
|||||||
Project id's are saved in order to not re-download everything. This ensures that only new media will be downloaded in each cronjob instance.
|
Project id's are saved in order to not re-download everything. This ensures that only new media will be downloaded in each cronjob instance.
|
||||||
No fancy bs going on. Just a simple script bodged together within 10 minutes at 3 AM.
|
No fancy bs going on. Just a simple script bodged together within 10 minutes at 3 AM.
|
||||||
|
|
||||||
|
It also supports downloading search results.
|
||||||
|
|
||||||
|
|
||||||
Currently only working for images. Feel free to issue a pull request if you want more.
|
Currently only working for images. Feel free to issue a pull request if you want more.
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
@ -13,22 +16,57 @@ pip3 install requests pyyaml
|
|||||||
All scripts require Python3. Tested using 3.9.9.
|
All scripts require Python3. Tested using 3.9.9.
|
||||||
|
|
||||||
## Running it
|
## Running it
|
||||||
Here we have two scripts:
|
Here we have three scripts:
|
||||||
|
|
||||||
|
### Grab an artist's profile
|
||||||
```bash
|
```bash
|
||||||
grab.py $artist-name
|
grab-artist.py 'mixppl'
|
||||||
```
|
```
|
||||||
This will grab one individual profile.
|
This will grab one individual profile, in this case the user 'mixppl'. You must use the username from the profile's URL, not the artist's full name!
|
||||||
|
|
||||||
---
|
### Grab search results
|
||||||
|
```bash
|
||||||
|
grab-search.py 'Game of Thrones' 100
|
||||||
|
```
|
||||||
|
This will grab the first 100 results of the search term 'Game of Thrones'.
|
||||||
|
If you omit the result limit, **ALL** results will be downloaded! That could be useful, if your search query is very niche. But if you omit it for a popular search term, like 'Game of Thrones', you're in for a ride,
|
||||||
|
as all approx 12000 projects will be queued for download.
|
||||||
|
|
||||||
|
### Automate it
|
||||||
|
### Invoke a scan
|
||||||
```bash
|
```bash
|
||||||
grab-all.py
|
grab-all.py
|
||||||
```
|
```
|
||||||
This will call `grab.py` on all artists listed in `artists.yaml`.
|
This will call `grab-artist.py` on all artists and `grab-search.py` on all search terms listed in `to-grab.yaml`.
|
||||||
|
|
||||||
|
Files will be saved to `./downloads/{artist_name}/*.{ext}` and `./downloads/search_{search_terms}/*{artist_id}_*.{ext}`.
|
||||||
|
Logs will be saved to `./logs/{artist_name/search_terms}.txt`.
|
||||||
|
Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name/search_terms}.txt`.
|
||||||
|
|
||||||
|
> :warning: Projects already downloaded from an artist's page will be downloaded **again** if they appear in a search result, and vice versa. Artist and search queries do NOT share download indices!
|
||||||
|
|
||||||
|
### Configure what to download
|
||||||
|
Simply adjust [`to-grab.yaml`](https://github.com/Leonetienne/Artstation-grabber/blob/master/to-grab.yaml) to your needs. Here is an example:
|
||||||
|
```
|
||||||
|
---
|
||||||
|
artists:
|
||||||
|
- mixppl
|
||||||
|
- shumolly
|
||||||
|
|
||||||
|
searches:
|
||||||
|
-
|
||||||
|
terms: Game of Thrones
|
||||||
|
max: 3
|
||||||
|
|
||||||
|
-
|
||||||
|
terms: Pirates
|
||||||
|
max: 3
|
||||||
|
|
||||||
|
-
|
||||||
|
terms: robby rotton
|
||||||
|
```
|
||||||
|
The last search term, 'robby rotton' shows that you can also omit `max`. If you do not want to fetch artists, or searches, at all, just delete that yaml-array entirely.
|
||||||
|
|
||||||
Files will be saved to `./downloads/{artist_name}/*.{ext}`.
|
|
||||||
Logs will be saved to `./logs/{artist_name}.txt`.
|
|
||||||
Download indices (to skip already downloaded projects) are kept in `./already_saved/{artist_name}.txt`.
|
|
||||||
|
|
||||||
## A word on power usage
|
## A word on power usage
|
||||||
Do not overuse this or you might piss off ArtStation's maintainers. Just because you CAN download 400 gigabytes of images per day doesn't mean that you should!
|
Do not overuse this or you might piss off ArtStation's maintainers. Just because you CAN download 400 gigabytes of images per day doesn't mean that you should!
|
||||||
|
16
to-grab.yaml
Normal file
16
to-grab.yaml
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
---
|
||||||
|
artists:
|
||||||
|
- mixppl
|
||||||
|
- shumolly
|
||||||
|
|
||||||
|
searches:
|
||||||
|
-
|
||||||
|
terms: Game of Thrones
|
||||||
|
max: 3
|
||||||
|
|
||||||
|
-
|
||||||
|
terms: Pirates
|
||||||
|
max: 3
|
||||||
|
|
||||||
|
-
|
||||||
|
terms: robby rotton
|
34
util.py
34
util.py
@ -3,6 +3,9 @@ from datetime import datetime
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
|
import urllib.request
|
||||||
|
from headers import *
|
||||||
|
|
||||||
class bcolors:
|
class bcolors:
|
||||||
HEADER = '\033[95m'
|
HEADER = '\033[95m'
|
||||||
@ -92,3 +95,34 @@ def slugify(value, allow_unicode=False):
|
|||||||
|
|
||||||
def getCurrentTimestamp():
    """Return the current UTC time formatted as ``MM-DD-YYYY HH-MM``."""
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware
    # UTC "now" formats to exactly the same string.
    from datetime import timezone

    return datetime.now(timezone.utc).strftime("%m-%d-%Y %H-%M")
|
||||||
|
|
||||||
|
def isPostAlreadySaved(post_id, artist_name):
    """Tell whether ``post_id`` is recorded in the download index.

    The index is the text file ``./already_saved/<slugified artist_name>.txt``
    holding one post id per line. ``artist_name`` is whatever key the caller
    uses for its index (an artist name or a search-term filename).

    Returns:
        True if the id is listed, False if it is not or if the index file
        does not exist yet.
    """
    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"

    try:
        # The context manager closes the handle; it was previously leaked
        # on every call.
        with open(idset_filename, "r") as index_file:
            # Compare without the line terminator so a final line lacking a
            # trailing newline is still recognised.
            return any(line.rstrip("\n") == post_id for line in index_file)
    except FileNotFoundError:
        # No index file yet means nothing has been saved for this key.
        return False
|
||||||
|
|
||||||
|
def markPostAsSaved(post_id, artist_name):
    """Append ``post_id`` to the download index.

    The index is the text file ``./already_saved/<slugified artist_name>.txt``;
    it is created if it does not exist. The ``./already_saved/`` directory
    itself is not created here.
    """
    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"

    # Append mode opens the existing file or creates it; the context manager
    # closes it even if the write fails.
    with open(idset_filename, "a") as index_file:
        index_file.write(post_id + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def downloadMedia(url, filename):
    """Download ``url`` to the local path ``filename``.

    The request is sent with the headers in ``image_request_headers``.
    Note that the opener is installed globally, so it also applies to any
    later ``urllib.request`` call made by this process.
    """
    # Prepare and execute query to download images
    opener = urllib.request.build_opener()
    opener.addheaders = image_request_headers
    urllib.request.install_opener(opener)

    # The (path, headers) return value is not needed by any caller.
    urllib.request.urlretrieve(url, filename)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user