Moved utils and headers to their own files

This commit is contained in:
Leonetienne 2022-01-30 19:19:50 +01:00
parent a6cbc0f4ae
commit b37105afa9
4 changed files with 142 additions and 138 deletions

2
.gitignore vendored
View File

@ -1,4 +1,4 @@
/logs/ /logs/
/downloads/ /downloads/
/already_saved/ /already_saved/
__pycache__/

150
grab.py
View File

@ -3,106 +3,18 @@ import mimetypes
import os import os
import sys import sys
import urllib.request import urllib.request
import unicodedata
import re
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
import time import time
import socket import socket
from util import *
from headers import *
# SYNOPSIS: # SYNOPSIS:
# To download posts from an artist: # To download posts from an artist:
# python3 grab.py mixppl # python3 grab.py mixppl
class bcolors:
    """ANSI escape sequences used to style console output."""

    # Color codes, prepended to a message before printing it.
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"

    # SGR 0: switches every color/attribute back off.
    ENDC = "\033[0m"

    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
def logMsg(msg, mode):
    """Print a colored log line and append it to the artist's logfile.

    The artist is taken from the module-level ``artist_name``: it is shown in
    the console line and, slugified, used as the logfile name under ./logs/.

    Args:
        msg: Text to log.
        mode: One of "okdl", "okndl", "warn" or "err"; selects the console
            color and the logfile prefix.

    An unrecognized mode is reported on the console and the message is then
    still logged, using the failure color and an "[UNKNOWN ]" prefix, instead
    of crashing while concatenating an int with a string.
    """
    styles = {
        "okdl": (bcolors.OKCYAN, "[OK_DL ]"),
        "okndl": (bcolors.OKBLUE, "[OK_NO_DL]"),
        "warn": (bcolors.WARNING, "[WARNING ]"),
        "err": (bcolors.FAIL, "[ERROR ]"),
    }
    if mode not in styles:
        print(bcolors.FAIL + "SUPPLIED INVALID LOG MODE!!! USE EITHER okdl, okndl, warn, or err!" + bcolors.ENDC)
    col, prefix = styles.get(mode, (bcolors.FAIL, "[UNKNOWN ]"))

    timestamp = getCurrentTimestamp()

    # Log to console; reset the color afterwards so it does not bleed into
    # whatever is printed next.
    print(col + f"[{timestamp}][{artist_name}]: " + msg + bcolors.ENDC)

    # Log to logfile (open existing or create). The directory is created on
    # demand, and UTF-8 is forced because messages embed arbitrary project
    # titles that a platform-default codec may not be able to encode.
    log_dir = Path("./logs")
    log_dir.mkdir(parents=True, exist_ok=True)
    with open(log_dir / (slugify(artist_name) + ".txt"), "a", encoding="utf-8") as logfile:
        logfile.write(prefix + " " + "[" + timestamp + "]: " + msg + "\n")
def extensionFromUrl(url):
    """Return the file extension (without the dot) of the file a URL points to.

    The query string ("?...") and fragment ("#...") are removed first, then
    only the last path component is inspected, so dots inside the host name,
    directories or GET parameters are never mistaken for the extension.

    Args:
        url: URL or path string.

    Returns:
        The text after the last '.' of the final path component, or an empty
        string when that component contains no dot.
    """
    # Drop GET parameters and fragment before looking for the dot.
    path = url.split('?', 1)[0].split('#', 1)[0]

    # Only the file name itself may contribute an extension.
    filename = path.rsplit('/', 1)[-1]

    _, dot, ext = filename.rpartition('.')
    return ext if dot else ""
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a lowercase slug.

    Taken from https://github.com/django/django/blob/master/django/utils/text.py

    The value is stringified and Unicode-normalized; unless 'allow_unicode'
    is set, everything that has no ASCII representation is dropped. Characters
    other than alphanumerics, underscores, hyphens and whitespace are removed,
    runs of whitespace/hyphens become a single hyphen, and leading/trailing
    hyphens and underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    cleaned = re.sub(r'[^\w\s-]', '', text.lower())
    hyphenated = re.sub(r'[-\s]+', '-', cleaned)
    return hyphenated.strip('-_')
def getCurrentTimestamp():
    """Return the current UTC time formatted as "MM-DD-YYYY HH-MM"."""
    # time.gmtime() supplies the current UTC time directly, avoiding
    # datetime.utcfromtimestamp(), which is deprecated since Python 3.12.
    return time.strftime("%m-%d-%Y %H-%M", time.gmtime())
def isPostAlreadySaved(post_id): def isPostAlreadySaved(post_id):
idset_filename = "./already_saved/" + slugify(artist_name) + ".txt" idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"
@ -134,42 +46,6 @@ def downloadMedia(url, filename):
urllib.request.install_opener(opener) urllib.request.install_opener(opener)
source = urllib.request.urlretrieve(asset_image_url, filename) source = urllib.request.urlretrieve(asset_image_url, filename)
# Browser-like headers passed to requests.get() when fetching an artist's
# projects.json pages from www.artstation.com.
# The literal used to end with a second 'authority' key ('api.reddit.com'),
# which silently replaced the ArtStation value; that leftover is removed.
project_fetch_headers = {
    'authority': 'www.artstation.com',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'de-DE,de;q=0.9',
}
# Browser-like headers for image downloads, kept as a list of (name, value)
# tuples. NOTE(review): presumably assigned to the urllib opener's
# `addheaders` in downloadMedia -- confirm against the caller.
image_request_headers = list({
    'authority': 'cdna.artstation.com',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'de-DE,de;q=0.9',
}.items())
# 2 minute timeout in case something gets stuck. # 2 minute timeout in case something gets stuck.
socket.setdefaulttimeout(120) socket.setdefaulttimeout(120)
@ -191,7 +67,7 @@ lastPageReached = False
pageCounter = 1 pageCounter = 1
try: try:
while not lastPageReached: while not lastPageReached:
logMsg(f"Fetching page {pageCounter} of {artist_name}...", "okndl") logMsg(f"Fetching page {pageCounter} of {artist_name}...", "okndl", artist_name)
projects_data = requests.get(f"https://www.artstation.com/users/{artist_name}/projects.json?page={pageCounter}", headers=project_fetch_headers) projects_data = requests.get(f"https://www.artstation.com/users/{artist_name}/projects.json?page={pageCounter}", headers=project_fetch_headers)
projects = projects_data.json()["data"] projects = projects_data.json()["data"]
@ -201,9 +77,9 @@ try:
if not lastPageReached: if not lastPageReached:
pageCounter = pageCounter + 1 pageCounter = pageCounter + 1
logMsg(f"Page contains {page_num_projects} projects...", "okndl") logMsg(f"Page contains {page_num_projects} projects...", "okndl", artist_name)
else: else:
logMsg(f"Page contains {page_num_projects} projects... That's the last page!", "okndl") logMsg(f"Page contains {page_num_projects} projects... That's the last page!", "okndl", artist_name)
# For each project in all of the artists projects # For each project in all of the artists projects
@ -211,7 +87,7 @@ try:
project_name = project["title"] project_name = project["title"]
project_hash_id = project["hash_id"] project_hash_id = project["hash_id"]
logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl") logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name)
# Have we already downloaded this post? # Have we already downloaded this post?
if not isPostAlreadySaved(project_hash_id): if not isPostAlreadySaved(project_hash_id):
@ -232,23 +108,23 @@ try:
# Generate a download filename # Generate a download filename
filename = artist_directory + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url) filename = artist_directory + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)
logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Downloading to '{filename}'...", "okdl") logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Downloading to '{filename}'...", "okdl", artist_name)
# Download it # Download it
downloadMedia(asset_image_url, filename) downloadMedia(asset_image_url, filename)
else: else:
logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl") logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name)
# After downloading all assets, mark the project as downloaded. # After downloading all assets, mark the project as downloaded.
markPostAsSaved(project_hash_id) markPostAsSaved(project_hash_id)
# Project is already downloaded # Project is already downloaded
else: else:
logMsg(f"Skipping project '{project_name}' [{project_hash_id}] because it is already downloaded.", "okndl") logMsg(f"Skipping project '{project_name}' [{project_hash_id}] because it is already downloaded.", "okndl", artist_name)
logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl") logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name)
except socket.timeout: except socket.timeout:
logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err") logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name)
except: except:
logMsg("Failed for some reason!", "err") logMsg("Failed for some reason!", "err", artist_name)

34
headers.py Normal file
View File

@ -0,0 +1,34 @@
# Browser-like headers passed to requests.get() when fetching an artist's
# projects.json pages from www.artstation.com.
# The literal used to end with a second 'authority' key ('api.reddit.com'),
# which silently replaced the ArtStation value; that leftover is removed.
project_fetch_headers = {
    'authority': 'www.artstation.com',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'de-DE,de;q=0.9',
}
# Browser-like headers for image downloads, kept as a list of (name, value)
# tuples. NOTE(review): presumably assigned to the urllib opener's
# `addheaders` in grab.py's downloadMedia -- confirm against the caller.
image_request_headers = list({
    'authority': 'cdna.artstation.com',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'de-DE,de;q=0.9',
}.items())

94
util.py Normal file
View File

@ -0,0 +1,94 @@
import unicodedata
from datetime import datetime
import time
from pathlib import Path
import re
class bcolors:
    """ANSI escape sequences used to style console output."""

    # Color codes, prepended to a message before printing it.
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"

    # SGR 0: switches every color/attribute back off.
    ENDC = "\033[0m"

    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
def logMsg(msg, mode, artist_name):
    """Print a colored log line and append it to the artist's logfile.

    Args:
        msg: Text to log.
        mode: One of "okdl", "okndl", "warn" or "err"; selects the console
            color and the logfile prefix.
        artist_name: Shown in the console line and, slugified, used as the
            logfile name under ./logs/.

    An unrecognized mode is reported on the console and the message is then
    still logged, using the failure color and an "[UNKNOWN ]" prefix, instead
    of crashing while concatenating an int with a string.
    """
    styles = {
        "okdl": (bcolors.OKCYAN, "[OK_DL ]"),
        "okndl": (bcolors.OKBLUE, "[OK_NO_DL]"),
        "warn": (bcolors.WARNING, "[WARNING ]"),
        "err": (bcolors.FAIL, "[ERROR ]"),
    }
    if mode not in styles:
        print(bcolors.FAIL + "SUPPLIED INVALID LOG MODE!!! USE EITHER okdl, okndl, warn, or err!" + bcolors.ENDC)
    col, prefix = styles.get(mode, (bcolors.FAIL, "[UNKNOWN ]"))

    timestamp = getCurrentTimestamp()

    # Log to console; reset the color afterwards so it does not bleed into
    # whatever is printed next.
    print(col + f"[{timestamp}][{artist_name}]: " + msg + bcolors.ENDC)

    # Log to logfile (open existing or create). The directory is created on
    # demand, and UTF-8 is forced because messages embed arbitrary project
    # titles that a platform-default codec may not be able to encode.
    log_dir = Path("./logs")
    log_dir.mkdir(parents=True, exist_ok=True)
    with open(log_dir / (slugify(artist_name) + ".txt"), "a", encoding="utf-8") as logfile:
        logfile.write(prefix + " " + "[" + timestamp + "]: " + msg + "\n")
def extensionFromUrl(url):
    """Return the file extension (without the dot) of the file a URL points to.

    The query string ("?...") and fragment ("#...") are removed first, then
    only the last path component is inspected, so dots inside the host name,
    directories or GET parameters are never mistaken for the extension.

    Args:
        url: URL or path string.

    Returns:
        The text after the last '.' of the final path component, or an empty
        string when that component contains no dot.
    """
    # Drop GET parameters and fragment before looking for the dot.
    path = url.split('?', 1)[0].split('#', 1)[0]

    # Only the file name itself may contribute an extension.
    filename = path.rsplit('/', 1)[-1]

    _, dot, ext = filename.rpartition('.')
    return ext if dot else ""
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a lowercase slug.

    Taken from https://github.com/django/django/blob/master/django/utils/text.py

    The value is stringified and Unicode-normalized; unless 'allow_unicode'
    is set, everything that has no ASCII representation is dropped. Characters
    other than alphanumerics, underscores, hyphens and whitespace are removed,
    runs of whitespace/hyphens become a single hyphen, and leading/trailing
    hyphens and underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    cleaned = re.sub(r'[^\w\s-]', '', text.lower())
    hyphenated = re.sub(r'[-\s]+', '-', cleaned)
    return hyphenated.strip('-_')
def getCurrentTimestamp():
    """Return the current UTC time formatted as "MM-DD-YYYY HH-MM"."""
    # time.gmtime() supplies the current UTC time directly, avoiding
    # datetime.utcfromtimestamp(), which is deprecated since Python 3.12.
    return time.strftime("%m-%d-%Y %H-%M", time.gmtime())