import os
from os import path
import pathlib
import re
import time

import mechanicalsoup
import requests
from bs4 import BeautifulSoup

# Scrapes the NEDesigns RCT2 object listing for a user-chosen page range and
# downloads every object found. The first object seen under a given name is
# saved to ./NEDDb/; later objects with the same name are saved to ./Dupes/.

os.makedirs('./NEDDb/', exist_ok=True)
os.makedirs('./Dupes/', exist_ok=True)

b = mechanicalsoup.StatefulBrowser()
url = "https://www.nedesigns.com/rct2-objects/"


def _prompt_page_range():
    """Ask for the first and last listing page until a valid range is given.

    Returns:
        A ``(start, end)`` tuple of ints with ``1 <= start <= end``.
    """
    while True:
        try:
            print ("To avoid scanning 86 pages of 64 broken object links each, enter a value of '87' or slightly higher. Currently, when sorted by downloads ascending, all objects on pages leading up to 87 are null.")
            first = int(input("Enter page number to start on: "))
            print("To scrape all objects, enter the value of the total number of pages of objects in https://www.nedesigns.com/rct2-objects/ ('435' or higher suggested).")
            last = int(input("Enter page to end on: "))
        except ValueError:
            # Only non-numeric input is retried; Ctrl-C / EOF still abort the
            # script instead of being trapped in the prompt loop.
            print("Error: Invalid values entered. Please put a value greater than zero, and be sure that the starting page is less than the ending page.")
            continue
        if first < 1 or last < 1 or first > last:
            print('Error: Invalid values entered. Please put a value greater than zero, and be sure that the starting page is less than the ending page.')
            print()
            continue
        print("This process can last several minutes to several hours, depending the range entered above. I recommend hydrating, brushing teeth, stretching, or anything else gives your eyes a break from staring at the screen, and checking back in a few minutes.")
        # NOTE(review): the notice below promises that responses in the
        # 167,870-169,750 byte range are rejected, but nothing in this script
        # applies such a filter - confirm whether it was dropped on purpose.
        print("Notice: Inaccessible downloads are prevented from saving. All 404 pages on NEDesigns.com are between 167,870 and 169,750 bytes on disk, and only one legitimate object gets close to this amount, so that byte range gets rejected. This feature may become moot if NEDesigns changes the size of the html on its 404 page or base UI to exceed or drop below this range.")
        return first, last


def _object_links(anchors, pattern):
    """Return the unique object-page hrefs among *anchors*, in page order.

    Anchors without an href are ignored. A link is kept when it matches
    *pattern* and contains at most six '/' characters.
    """
    hrefs = (anchor.get('href') for anchor in anchors)
    matching = (
        str(href) for href in hrefs
        if href and pattern.match(href) and str(href).count('/') <= 6
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so which
    # same-named object ends up in ./Dupes/ no longer depends on set ordering.
    return list(dict.fromkeys(matching))


def _resolve_download(link):
    """Fetch an object page and work out its name and download URL.

    Returns:
        ``(name, download_url)``, or ``None`` when the page does not contain
        the expected header markup.
    """
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    header = soup.find('div', {'class': "content-header-wrap"})
    if header is None:
        return None
    try:
        # Take the text after the first '/ ' in the header's second child and
        # drop its last five characters (presumably a closing tag - TODO
        # confirm against the live markup).
        content_fix = str(header.contents[1]).split('/ ')[1][:-5]
    except IndexError:
        return None
    if '&' in content_fix:
        # The serialized markup carries '&' as the entity '&amp;'; the saved
        # name uses the plain character, the URL keeps the serialized text.
        name = content_fix.replace('&amp;', '&')
        download_url = '/'.join(link.split('/')[:-1]) + '/' + content_fix + '/download'
    else:
        name = content_fix
        download_url = link + 'download'
    return name, download_url


def _clear_screen():
    """Clear the console on both Windows and POSIX terminals."""
    os.system('cls' if os.name == 'nt' else 'clear')


def _download(link_url, destination):
    """Stream *link_url* to the file *destination* in 1 KiB chunks."""
    with requests.get(link_url, stream=True) as response:
        with open(destination, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)


datNames = []
linkList = []

start, end = _prompt_page_range()

# Phase 1: walk the listing pages and resolve each object's download link.
ext = re.compile(r'.*rct2-object\/\d*\/')
for i in range(start, end + 1):
    full_url = f"{url}page-{i}?order=da"
    b.open(full_url)
    links = _object_links(b.get_current_page().find_all('a'), ext)
    for link in links:
        resolved = _resolve_download(link)
        if resolved is None:
            print(f"Skipping {link}: object name not found on page.")
            continue
        name, tempLink = resolved
        datNames.append(name)
        linkList.append(tempLink)

# Phase 2: download everything, redrawing a text progress bar as we go.
loadingBar = '[..................................................................]'
print(loadingBar)
value = len(linkList)
# Advance the bar roughly 66 times over the run; max() keeps the step at
# least 1 so short runs (< 66 links) do not divide by zero.
step = max(1, value // 66)
container = set()
for num, (file, link_url) in enumerate(zip(datNames, linkList)):
    if num % step == 0:
        loadingBar = loadingBar.replace('.', '|', 1)
    _clear_screen()
    print(loadingBar)
    print(str(int(num / value * 99)) + "%")

    # First occurrence of a name goes to NEDDb, repeats go to Dupes.
    folder = "./Dupes/" if file in container else "./NEDDb/"
    container.add(file)
    filename = str(file) + '~(' + link_url.split('/')[-3].upper() + ').DAT'
    _download(link_url, folder + str(filename))
# Final screen: a completed progress bar followed by the closing notes,
# emitted one line at a time in this order.
_closing_lines = (
    '[||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||]',
    '100%',
    "All done! Downloaded objects should be in a folder called 'NEDDb' in the same location as this application.",
    "If multiple files with the same name were downloaded, those went into a folder called 'Dupes' (created alongside the NEDDb folder). These objects are codified by suffixing their object number in the NEDesigns database to their filenames.",
    "If you want to use these files, please move the 'NEDDb' and 'Dupes' folders into the directory whence custom objects should be stored.",
    "Thanks for using this tool! If you have any questions, contact OPStellar on NEDesigns.com or its Discord server, or post an issue/suggestion on github.com/OPStellar/NEDDb and I'll address it when I can.",
    "You are now safe to terminate this application.",
    "Peace and carrots,",
    "Sir Vyvre the Lone",
)
for _closing_line in _closing_lines:
    print(_closing_line)

# Block until the user presses Enter before the script ends.
input()