# Artstation-grabber/grab.py

import requests
import mimetypes
import os
import sys
import urllib.request
from pathlib import Path
from datetime import datetime
import time
import socket

from util import *
from headers import *

# SYNOPSIS:
# To download posts from an artist:
# python3 grab.py mixppl

def isPostAlreadySaved(post_id):
    """Return whether `post_id` is listed in the current artist's index file.

    The index file is "./already_saved/<slugified artist_name>.txt" and holds
    one post id per line (the format markPostAsSaved writes). `artist_name`
    is read from module scope.

    Args:
        post_id: Id of the post to look up, as a string.

    Returns:
        True if the id is listed in the index file; False if it is not listed
        or the index file does not exist yet.
    """
    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"

    # Does the index file even exist yet?
    if not os.path.exists(idset_filename):
        return False

    # The context manager closes the handle again; previously it was left
    # open on every call.
    with open(idset_filename, "r") as index_file:
        # Compare without the line terminator so that a final line lacking
        # a trailing newline is still recognised.
        return any(line.rstrip("\n") == post_id for line in index_file)

def markPostAsSaved(post_id):
    """Record `post_id` as downloaded in the current artist's index file.

    Appends the id, followed by a newline, to
    "./already_saved/<slugified artist_name>.txt". The file is created if it
    does not exist yet. `artist_name` is read from module scope.

    Args:
        post_id: Id of the post to record, as a string.
    """
    idset_filename = "./already_saved/" + slugify(artist_name) + ".txt"

    # Append mode opens the existing index or creates a new one. The context
    # manager guarantees the handle is closed even if the write fails.
    with open(idset_filename, "a") as index_file:
        index_file.write(post_id + "\n")


def downloadMedia(url, filename):
    """Download the resource at `url` and store it at `filename`.

    The request is sent with the headers from `image_request_headers`.

    Note: the opener is installed globally via urllib.request.install_opener,
    so these headers also apply to later urllib.request calls in this process.

    Args:
        url: Address of the media file to download.
        filename: Path of the local file to write.
    """
    # Prepare and execute query to download images
    opener = urllib.request.build_opener()
    opener.addheaders = image_request_headers
    urllib.request.install_opener(opener)

    # Fetch the `url` argument. This used to read the global
    # `asset_image_url`, which only worked when the caller happened to have
    # set a variable of that name.
    urllib.request.urlretrieve(url, filename)

# 2 minute timeout in case something gets stuck.
socket.setdefaulttimeout(120)

# The artist's username is the one required command line argument. Exit with
# a usage hint instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: python3 grab.py <artist_name>")

# Artist names are handled in lower case throughout the script.
artist_name = sys.argv[1].lower()

# Create artist directory if it doesn't exist
artist_directory = "./downloads/" + slugify(artist_name) + "/"
Path(artist_directory).mkdir(parents=True, exist_ok=True)

# Create directory for the per-artist index files of already saved posts
Path("./already_saved/").mkdir(parents=True, exist_ok=True)

# Create directory for logging
Path("./logs/").mkdir(parents=True, exist_ok=True)


# Walk through every page of the artist's project listing and download the
# image assets of each project that has not been saved before.
lastPageReached = False
pageCounter = 1

# requests does not honour socket.setdefaulttimeout(), so pass the configured
# default to every call explicitly; without it a stalled request could hang
# forever.
request_timeout = socket.getdefaulttimeout()

try:
    while not lastPageReached:
        logMsg(f"Fetching page {pageCounter} of {artist_name}...", "okndl", artist_name)
        projects_data = requests.get(
            f"https://www.artstation.com/users/{artist_name}/projects.json?page={pageCounter}",
            headers=project_fetch_headers,
            timeout=request_timeout,
        )
        # Surface HTTP errors (unknown artist, rate limiting, ...) directly
        # instead of failing later on a missing "data" key.
        projects_data.raise_for_status()
        projects = projects_data.json()["data"]

        page_num_projects = len(projects)

        # Each full page contains 50 projects. If it has less than 50, it is the last page
        lastPageReached = page_num_projects < 50

        if not lastPageReached:
            pageCounter = pageCounter + 1
            logMsg(f"Page contains {page_num_projects} projects...", "okndl", artist_name)
        else:
            logMsg(f"Page contains {page_num_projects} projects... That's the last page!", "okndl", artist_name)

        # For each project on this page
        for project in projects:
            project_name = project["title"]
            project_hash_id = project["hash_id"]

            logMsg(f"Found project '{project_name}' with id {project_hash_id}. Fetching more info about it...", "okndl", artist_name)

            # Have we already downloaded this post?
            if isPostAlreadySaved(project_hash_id):
                logMsg(f"Skipping project '{project_name}' [{project_hash_id}] because it is already downloaded.", "okndl", artist_name)
                continue

            # Fetch information about the project
            project_info = requests.get(
                f"https://www.artstation.com/projects/{project_hash_id}.json",
                headers=project_fetch_headers,
                timeout=request_timeout,
            )
            project_info.raise_for_status()
            assets = project_info.json()["assets"]

            # For each asset in the project (might be multiple images)
            for asset in assets:
                if asset["asset_type"] == "image":
                    asset_image_url = asset["image_url"]
                    asset_position = asset["position"]

                    # Generate a download filename
                    filename = artist_directory + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)

                    logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Downloading to '{filename}'...", "okdl", artist_name)

                    # Download it
                    downloadMedia(asset_image_url, filename)
                else:
                    # Look the position up here as well. It used to be set
                    # only in the image branch, so this message raised a
                    # NameError (or showed a stale value) for non-image assets.
                    asset_position = asset.get("position", "unknown")
                    logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] at position {asset_position}. Skipping...", "okdl", artist_name)

            # After downloading all assets, mark the project as downloaded.
            markPostAsSaved(project_hash_id)

    logMsg(f"Finished all pages of {artist_name}... Total pages of this artist scanned: {pageCounter}", "okndl", artist_name)

except (socket.timeout, requests.exceptions.Timeout):
    # socket.timeout comes from the urllib download, requests' Timeout from
    # the JSON API calls.
    logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", artist_name)
except Exception as err:
    # Catch Exception rather than everything so Ctrl-C and sys.exit() still
    # work, and log what actually went wrong.
    logMsg(f"Failed for some reason! ({type(err).__name__}: {err})", "err", artist_name)