Spaces:
Configuration error
Configuration error
import json
import os
import random
import time
import urllib
import urllib.request

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Scrapes myanimelist.net anime pages one numeric id at a time: saves the
# cover image to ./images/<id>.jpg and the page metadata plus reviews to
# anime/<id>.json.

COVER_XPATH = "//div[@class='leftside']//a/img"
REVIEWS_TAB_XPATH = "//div[@id='horiznav_nav']//a[text() = 'Reviews']"

# Seconds to pause after each page; one value is drawn per id.
DELAY_CHOICES_SECONDS = [4]
# Seconds to pause before restarting the browser when the page has neither a
# Reviews tab nor a 404 marker (presumably a block / rate limit -- TODO confirm).
RESTART_PAUSE_SECONDS = 150


def _read_field(driver, *labels):
    """Return the text of the first sidebar row whose label span matches.

    Each label is tried in order as ``<label>:``; the label itself is removed
    from the row text and the remainder is stripped. Returns ``''`` when none
    of the labels can be read.
    """
    for label in labels:
        try:
            row = driver.find_element(By.XPATH, f"//span[text() = '{label}:']/..")
            return row.text.replace(f"{label}:", '').strip()
        except WebDriverException:
            # Missing (or unreadable) row: fall through to the next label.
            continue
    return ''


def _read_description(driver):
    """Return the synopsis paragraph text, or ``''`` when it cannot be read."""
    try:
        return driver.find_element(By.XPATH, "//p[@itemprop='description']").text
    except WebDriverException:
        return ''


def _read_english_title(driver):
    """Click the alternative-titles toggle, then read the ``English:`` row.

    Returns ``''`` when the toggle cannot be clicked or the row is missing.
    """
    try:
        driver.find_element(By.CLASS_NAME, 'js-anime-toggle-alternative-title-button').click()
    except WebDriverException:
        return ''
    return _read_field(driver, 'English')


def _split_list(value):
    """Split a comma-separated field into stripped items (``''`` -> ``['']``)."""
    return [item.strip() for item in value.split(',')]


def _download_cover(driver, anime_id):
    """Best-effort download of the cover image to ``./images/<anime_id>.jpg``."""
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, COVER_XPATH))
        )
        image_url = driver.find_element(By.XPATH, COVER_XPATH).get_attribute('src')
        if image_url:
            urllib.request.urlretrieve(image_url, f"./images/{anime_id}.jpg")
    except Exception as cover_error:
        # A missing cover must never abort the metadata scrape, so every
        # ordinary error is logged and ignored (KeyboardInterrupt still
        # propagates, unlike with a bare ``except:``).
        print(f"cover download failed for {anime_id}: {cover_error}")


def _scrape_metadata(driver):
    """Read the metadata fields from the currently loaded anime page.

    Fields that cannot be read come back as ``''`` (or ``['']`` for the
    comma-separated ones). Keys are inserted in the order they are written
    to the JSON file.
    """
    # Lookup order is kept as in the original script: description, synonyms
    # and Japanese title are read before the alternative-titles toggle is
    # clicked, because the click changes what is displayed on the page.
    description = _read_description(driver)
    synonyms = _read_field(driver, 'Synonyms')
    japanese = _read_field(driver, 'Japanese')
    english = _read_english_title(driver)
    anime_type = _read_field(driver, 'Type')
    episodes = _read_field(driver, 'Episodes')
    premiered = _read_field(driver, 'Premiered')
    broadcast = _read_field(driver, 'Broadcast')
    producers = _read_field(driver, 'Producers')
    licensors = _read_field(driver, 'Licensors')
    studios = _read_field(driver, 'Studios')
    source = _read_field(driver, 'Source')
    genres = _read_field(driver, 'Genres')
    # The row is labelled "Themes:" or "Theme:"; try the plural first.
    themes = _read_field(driver, 'Themes', 'Theme')
    demographic = _read_field(driver, 'Demographic')
    duration = _read_field(driver, 'Duration')
    rating = _read_field(driver, 'Rating')

    return {
        'synonyms': synonyms,
        'japanese': japanese,
        'english': english,
        'type': anime_type,
        'episodes': episodes,
        'premiered': premiered,
        'broadcast': broadcast,
        'producers': _split_list(producers),
        'licensors': _split_list(licensors),
        'studios': _split_list(studios),
        'source': _split_list(source),
        'genres': _split_list(genres),
        'themes': _split_list(themes),
        # Stripped like every other list field (previously left unstripped).
        'demographic': _split_list(demographic),
        'duration': duration,
        'rating': rating,
        'description': description,
    }


def _collect_reviews(driver):
    """Return ``[{'sentiment': ..., 'text': ...}, ...]`` for the loaded reviews.

    The text is the visible part plus the ``textContent`` of the
    ``js-hidden`` element, with newlines flattened to spaces. Raises a
    selenium exception if a review element lacks an expected child.
    """
    collected = []
    for review in driver.find_elements(By.CLASS_NAME, 'js-review-element'):
        visible = review.find_element(By.CLASS_NAME, 'text')
        sentiment = review.find_element(By.CLASS_NAME, 'tag')
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'js-hidden'))
        )
        hidden = review.find_element(By.CLASS_NAME, 'js-hidden')
        full_text = visible.text + hidden.get_attribute('textContent')
        collected.append({
            'sentiment': sentiment.text,
            'text': full_text.strip().replace('\n', ' '),
        })
    return collected


def _write_json(anime_id, payload):
    """Write *payload* as JSON to ``anime/<anime_id>.json``."""
    with open(f"anime/{anime_id}.json", "w") as outfile:
        json.dump(payload, outfile)


driver = webdriver.Chrome()
try:
    # Ids that already have a JSON file. Not consulted by the loop below
    # (the id range is hard-coded); kept as a module-level name.
    existing = {int(name.replace('.json', '')) for name in os.listdir('./anime')}
    # total = set(range(100000))
    # difference = total.difference(existing)
    difference = range(3210, 10000)

    for i in difference:
        random_num = random.choice(DELAY_CHOICES_SECONDS)
        driver.get(f"https://myanimelist.net/anime/{i}/")
        try:
            _download_cover(driver, i)
            information = _scrape_metadata(driver)

            time.sleep(2)
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, REVIEWS_TAB_XPATH))
                )
                driver.find_element(By.XPATH, REVIEWS_TAB_XPATH).click()
            except Exception as nav_error:
                print(nav_error)
                try:
                    # A 404 page means the id does not exist: record an empty
                    # object for it and move on.
                    driver.find_element(By.CLASS_NAME, 'error404')
                    _write_json(i, {})
                    continue
                except Exception as lookup_error:
                    # Neither a Reviews tab nor a 404 marker: end the whole
                    # browser session (quit, not just close the window),
                    # pause, and start a fresh one. This id is skipped.
                    print(lookup_error)
                    driver.quit()
                    time.sleep(RESTART_PAUSE_SECONDS)
                    driver = webdriver.Chrome()
                    continue

            driver.execute_script("window.scrollTo(0, 0)")
            reviews = _collect_reviews(driver)
            if not reviews:
                # Nothing is written for pages without reviews.
                continue

            information['reviews'] = reviews
            _write_json(i, information)
            time.sleep(random_num)
        except Exception as page_error:
            # Per-id boundary: log, pause, and carry on with the next id.
            print(page_error)
            time.sleep(random_num)
finally:
    # quit() ends the WebDriver session and its driver process; close() would
    # only close the current window.
    driver.quit()