Moved utils and headers to their own files
This commit is contained in:
parent
a6cbc0f4ae
commit
b37105afa9
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,4 @@
|
|||||||
/logs/
|
/logs/
|
||||||
/downloads/
|
/downloads/
|
||||||
/already_saved/
|
/already_saved/
|
||||||
|
__pycache__/
|
||||||
|
150
grab.py
150
grab.py
@ -3,106 +3,18 @@ import mimetypes
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import unicodedata
|
|
||||||
import re
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import time
|
import time
|
||||||
import socket
|
import socket
|
||||||
|
|
||||||
|
from util import *
|
||||||
|
from headers import *
|
||||||
|
|
||||||
# SYNOPSIS:
|
# SYNOPSIS:
|
||||||
# To download posts from an artist:
|
# To download posts from an artist:
|
||||||
# python3 grab.py mixppl
|
# python3 grab.py mixppl
|
||||||
|
|
||||||
class bcolors:
|
|
||||||
HEADER = '\033[95m'
|
|
||||||
OKBLUE = '\033[94m'
|
|
||||||
OKCYAN = '\033[96m'
|
|
||||||
OKGREEN = '\033[92m'
|
|
||||||
WARNING = '\033[93m'
|
|
||||||
FAIL = '\033[91m'
|
|
||||||
ENDC = '\033[0m'
|
|
||||||
BOLD = '\033[1m'
|
|
||||||
UNDERLINE = '\033[4m'
|
|
||||||
|
|
||||||
def logMsg(msg, mode):
|
|
||||||
|
|
||||||
col = 0
|
|
||||||
prefix = 0
|
|
||||||
|
|
||||||
if mode == "okdl":
|
|
||||||
col = bcolors.OKCYAN
|
|
||||||
prefix = "[OK_DL ]"
|
|
||||||
|
|
||||||
elif mode == "okndl":
|
|
||||||
col = bcolors.OKBLUE
|
|
||||||
prefix = "[OK_NO_DL]"
|
|
||||||
|
|
||||||
elif mode == "warn":
|
|
||||||
col = bcolors.WARNING
|
|
||||||
prefix = "[WARNING ]"
|
|
||||||
|
|
||||||
elif mode == "err":
|
|
||||||
col = bcolors.FAIL
|
|
||||||
prefix = "[ERROR ]"
|
|
||||||
else:
|
|
||||||
print(bcolors.FAIL + "SUPPLIED INVALID LOG MODE!!! USE EITHER okdl, okndl, warn, or err!")
|
|
||||||
|
|
||||||
timestamp = getCurrentTimestamp()
|
|
||||||
|
|
||||||
# Log to console
|
|
||||||
print(col + f"[{timestamp}][{artist_name}]: " + msg)
|
|
||||||
|
|
||||||
# Log to logfile
|
|
||||||
logfile = open("./logs/" + slugify(artist_name) + ".txt", "a") # Open existing or create
|
|
||||||
logfile.write(prefix + " " + "[" + timestamp + "]: " + msg + "\n")
|
|
||||||
logfile.close()
|
|
||||||
|
|
||||||
|
|
||||||
def extensionFromUrl(url):
|
|
||||||
rurl = url[::-1]
|
|
||||||
rext = ""
|
|
||||||
for c in rurl:
|
|
||||||
if c != '.':
|
|
||||||
rext = rext + c
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
|
|
||||||
ext = rext[::-1]
|
|
||||||
|
|
||||||
# Now remove the get parameters
|
|
||||||
foundQuestionmark = False
|
|
||||||
actualExt = ""
|
|
||||||
for c in ext:
|
|
||||||
if c == '?':
|
|
||||||
foundQuestionmark = True
|
|
||||||
|
|
||||||
if not foundQuestionmark:
|
|
||||||
actualExt = actualExt + c
|
|
||||||
|
|
||||||
return actualExt
|
|
||||||
|
|
||||||
|
|
||||||
def slugify(value, allow_unicode=False):
|
|
||||||
"""
|
|
||||||
Taken from https://github.com/django/django/blob/master/django/utils/text.py
|
|
||||||
Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
|
|
||||||
dashes to single dashes. Remove characters that aren't alphanumerics,
|
|
||||||
underscores, or hyphens. Convert to lowercase. Also strip leading and
|
|
||||||
trailing whitespace, dashes, and underscores.
|
|
||||||
"""
|
|
||||||
value = str(value)
|
|
||||||
if allow_unicode:
|
|
||||||
value = unicodedata.normalize('NFKC', value)
|
|
||||||
else:
|
|
||||||
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
|
|
||||||
value = re.sub(r'[^\w\s-]', '', value.lower())
|
|
||||||
return re.sub(r'[-\s]+', '-', value).strip('-_')
|
|
||||||
|
|
||||||
|
|
||||||
def getCurrentTimestamp():
|
|
||||||
return datetime.utcfromtimestamp(time.time()).strftime("%m-%d-%Y %H-%M")
|
|
||||||
|
|
||||||
def isPostAlreadySaved(post_id):
|
def isPostAlreadySaved(post_id):
|
||||||
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
|
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
|
||||||
|
|
||||||
@ -134,42 +46,6 @@ def downloadMedia(url, filename):
|
|||||||
urllib.request.install_opener(opener)
|
urllib.request.install_opener(opener)
|
||||||
source = urllib.request.urlretrieve(asset_image_url, filename)
|
source = urllib.request.urlretrieve(asset_image_url, filename)
|
||||||
|
|
||||||
|
|
||||||
project_fetch_headers = {
|
|
||||||
'authority': 'www.artstation.com',
|
|
||||||
'pragma': 'no-cache',
|
|
||||||
'cache-control': 'no-cache',
|
|
||||||
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
|
|
||||||
'sec-ch-ua-mobile': '?0',
|
|
||||||
'sec-ch-ua-platform': '"Windows"',
|
|
||||||
'upgrade-insecure-requests': '1',
|
|
||||||
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
|
|
||||||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
|
||||||
'sec-fetch-site': 'none',
|
|
||||||
'sec-fetch-mode': 'navigate',
|
|
||||||
'sec-fetch-user': '?1',
|
|
||||||
'sec-fetch-dest': 'document',
|
|
||||||
'accept-language': 'de-DE,de;q=0.9',
|
|
||||||
'authority': 'api.reddit.com'
|
|
||||||
}
|
|
||||||
|
|
||||||
image_request_headers = [
|
|
||||||
('authority', 'cdna.artstation.com'),
|
|
||||||
('pragma', 'no-cache'),
|
|
||||||
('cache-control', 'no-cache'),
|
|
||||||
('sec-ch-ua', '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"'),
|
|
||||||
('sec-ch-ua-mobile', '?0'),
|
|
||||||
('sec-ch-ua-platform', '"Windows"'),
|
|
||||||
('upgrade-insecure-requests', '1'),
|
|
||||||
('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'),
|
|
||||||
('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
|
|
||||||
('sec-fetch-site', 'none'),
|
|
||||||
('sec-fetch-mode', 'navigate'),
|
|
||||||
('sec-fetch-user', '?1'),
|
|
||||||
('sec-fetch-dest', 'document'),
|
|
||||||
('accept-language', 'de-DE,de;q=0.9')
|
|
||||||
]
|
|
||||||
|
|
||||||
# 2 minute timeout in case something gets stuck.
|
# 2 minute timeout in case something gets stuck.
|
||||||
socket.setdefaulttimeout(120)
|
socket.setdefaulttimeout(120)
|
||||||
|
|
||||||
@ -191,7 +67,7 @@ lastPageReached = False
|
|||||||
pageCounter = 1
|
pageCounter = 1
|
||||||
try:
|
try:
|
||||||
while not lastPageReached:
|
while not lastPageReached:
|
||||||
logMsg(f"Fetching page {pageCounter} of {artist_name}...", "okndl")
|
logMsg(f"Fetching page {pageCounter} of {artist_name}...", "okndl", artist_name)
|
||||||
projects_data = requests.get(f"https://www.artstation.com/users/{artist_name}/projects.json?page={pageCounter}", headers=project_fetch_headers)
|
projects_data = requests.get(f"https://www.artstation.com/users/{artist_name}/projects.json?page={pageCounter}", headers=project_fetch_headers)
|
||||||
projects = projects_data.json()["data"]
|
projects = projects_data.json()["data"]
|
||||||
|
|
||||||
@ -201,9 +77,9 @@ try:
|
|||||||
|
|
||||||
if not lastPageReached:
|
if not lastPageReached:
|
||||||
pageCounter = pageCounter + 1
|
pageCounter = pageCounter + 1
|
||||||
logMsg(f"Page contains {page_num_projects} projects...", "okndl")
|
logMsg(f"Page contains {page_num_projects} projects...", "okndl", artist_name)
|
||||||
else:
|
else:
|
||||||
logMsg(f"Page contains {page_num_projects} projects... That's the last page!", "okndl")
|
logMsg(f"Page contains {page_num_projects} projects... That's the last page!", "okndl", artist_name)
|
||||||
|
|
||||||
|
|
||||||
# For each project in all of the artists projects
|
# For each project in all of the artists projects
|
||||||
@ -211,7 +87,7 @@ try:
|
|||||||
project_name = project["title"]
|
project_name = project["title"]
|
||||||
project_hash_id = project["hash_id"]
|
project_hash_id = project["hash_id"]
|
||||||
|
|
||||||
logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl")
|
logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name)
|
||||||
|
|
||||||
# Have we already downloaded this post?
|
# Have we already downloaded this post?
|
||||||
if not isPostAlreadySaved(project_hash_id):
|
if not isPostAlreadySaved(project_hash_id):
|
||||||
@ -232,23 +108,23 @@ try:
|
|||||||
# Generate a download filename
|
# Generate a download filename
|
||||||
filename = artist_directory + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)
|
filename = artist_directory + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)
|
||||||
|
|
||||||
logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Downloading to '{filename}'...", "okdl")
|
logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Downloading to '{filename}'...", "okdl", artist_name)
|
||||||
|
|
||||||
# Download it
|
# Download it
|
||||||
downloadMedia(asset_image_url, filename)
|
downloadMedia(asset_image_url, filename)
|
||||||
else:
|
else:
|
||||||
logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl")
|
logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name)
|
||||||
|
|
||||||
# After downloading all assets, mark the project as downloaded.
|
# After downloading all assets, mark the project as downloaded.
|
||||||
markPostAsSaved(project_hash_id)
|
markPostAsSaved(project_hash_id)
|
||||||
|
|
||||||
# Project is already downloaded
|
# Project is already downloaded
|
||||||
else:
|
else:
|
||||||
logMsg(f"Skipping project '{project_name}' [{project_hash_id}] because it is already downloaded.", "okndl")
|
logMsg(f"Skipping project '{project_name}' [{project_hash_id}] because it is already downloaded.", "okndl", artist_name)
|
||||||
|
|
||||||
logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl")
|
logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name)
|
||||||
|
|
||||||
except socket.timeout:
|
except socket.timeout:
|
||||||
logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err")
|
logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name)
|
||||||
except:
|
except:
|
||||||
logMsg("Failed for some reason!", "err")
|
logMsg("Failed for some reason!", "err", artist_name)
|
||||||
|
34
headers.py
Normal file
34
headers.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
# Browser-like request headers sent when fetching an artist's project listing
# (passed as ``headers=`` to ``requests.get`` in grab.py).
#
# The original literal listed the 'authority' key twice; in a dict literal the
# last occurrence silently wins, so the effective value was 'api.reddit.com'
# even though the requests go to www.artstation.com. The stray entry is removed.
project_fetch_headers = {
    'authority': 'www.artstation.com',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'de-DE,de;q=0.9',
}
|
||||||
|
|
||||||
|
# Browser-like headers used for image downloads, kept as an ordered list of
# (name, value) pairs. Written as a mapping for readability and flattened to
# the pair list on assignment; insertion order is preserved.
image_request_headers = list({
    'authority': 'cdna.artstation.com',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'de-DE,de;q=0.9',
}.items())
|
94
util.py
Normal file
94
util.py
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
import unicodedata
|
||||||
|
from datetime import datetime
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
|
||||||
|
class bcolors:
    """ANSI escape sequences for colouring and styling console output."""

    # Text attributes
    ENDC = '\033[0m'        # reset all attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Bright foreground colours
    FAIL = '\033[91m'       # red
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'    # yellow
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'     # magenta
    OKCYAN = '\033[96m'
|
||||||
|
|
||||||
|
def logMsg(msg, mode, artist_name):
    """Print a colour-coded log line and append it to the artist's logfile.

    Args:
        msg: Text to log.
        mode: One of "okdl", "okndl", "warn" or "err"; selects the console
            colour and the logfile prefix.
        artist_name: Shown in the console line and, slugified, used as the
            logfile name under ``./logs/``.

    An unknown ``mode`` is reported on the console and the message is then
    logged with the "err" styling. Previously the colour and prefix stayed
    as the integer 0 in that case, so the string concatenation below raised
    TypeError.
    """
    # NOTE(review): the prefixes look like they were meant to be equal
    # width -- confirm the intended padding.
    styles = {
        "okdl": (bcolors.OKCYAN, "[OK_DL ]"),
        "okndl": (bcolors.OKBLUE, "[OK_NO_DL]"),
        "warn": (bcolors.WARNING, "[WARNING ]"),
        "err": (bcolors.FAIL, "[ERROR ]"),
    }

    if mode not in styles:
        print(bcolors.FAIL + "SUPPLIED INVALID LOG MODE!!! USE EITHER okdl, okndl, warn, or err!" + bcolors.ENDC)
        mode = "err"
    col, prefix = styles[mode]

    timestamp = getCurrentTimestamp()

    # Log to console. ENDC resets the colour so it does not bleed into
    # whatever the terminal prints next.
    print(col + f"[{timestamp}][{artist_name}]: " + msg + bcolors.ENDC)

    # Log to logfile (opened in append mode: existing file or a new one).
    # Explicit UTF-8 because messages embed project titles, which may contain
    # characters the platform default encoding cannot represent.
    with open("./logs/" + slugify(artist_name) + ".txt", "a", encoding="utf-8") as logfile:
        logfile.write(prefix + " " + "[" + timestamp + "]: " + msg + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def extensionFromUrl(url):
    """Return the file extension of the resource a URL points to, without the dot.

    Only the path component of the URL is inspected, so dots inside the
    query string or fragment (e.g. ``pic.jpg?v=1.2``) or in the host name
    do not confuse the result. The previous character scan took everything
    after the last '.' in the whole URL.

    Args:
        url: The URL (or bare filename) to inspect.

    Returns:
        The text after the last '.' in the final path segment, or an empty
        string when that segment contains no '.'.
    """
    # Local import: the enclosing module does not import urllib.parse.
    from urllib.parse import urlsplit

    path = urlsplit(url).path
    # Only the last path segment can carry the file extension.
    filename = path.rsplit("/", 1)[-1]
    _, dot, extension = filename.rpartition(".")
    return extension if dot else ""
|
||||||
|
|
||||||
|
|
||||||
|
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a lowercase, filesystem-friendly slug.

    Adapted from https://github.com/django/django/blob/master/django/utils/text.py

    The value is stringified and Unicode-normalised; unless 'allow_unicode'
    is True, everything that cannot be expressed in ASCII is dropped. After
    lowercasing, characters other than word characters, whitespace and
    hyphens are removed, runs of whitespace/hyphens collapse into a single
    hyphen, and leading/trailing hyphens and underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        # Decompose accented characters, then discard the non-ASCII remainder.
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    lowered = text.lower()
    cleaned = re.sub(r'[^\w\s-]', '', lowered)
    collapsed = re.sub(r'[-\s]+', '-', cleaned)
    return collapsed.strip('-_')
|
||||||
|
|
||||||
|
|
||||||
|
def getCurrentTimestamp():
    """Return the current UTC time formatted as ``MM-DD-YYYY HH-MM``.

    Uses a timezone-aware ``datetime.now(timezone.utc)`` instead of
    ``datetime.utcfromtimestamp``, which is deprecated since Python 3.12.
    The formatted output is unchanged.
    """
    # Local import: the enclosing module only imports ``datetime`` itself.
    from datetime import timezone

    return datetime.now(timezone.utc).strftime("%m-%d-%Y %H-%M")
|
Loading…
x
Reference in New Issue
Block a user