from bs4 import BeautifulSoup
from pprint import pprint
from ..utils import SeleniumScraper
from selenium.webdriver.common.by import By
import sys

scraper = None


def scrap(id: int = 1):
    if not id: raise ValueError("The 'id' parameter is required.")
    global scraper
    try:
        url = f"https://www.colamanga.com/{id}/"
        # Reuse one Selenium session across calls instead of spawning a new browser each time.
        if not scraper: scraper = SeleniumScraper()
        driver = scraper.driver()
        driver.get(url)
        DATA = {}
        DATA["id"] = id
        # Get info
        cover_url = driver.find_element(By.CLASS_NAME, "fed-list-pics").get_attribute("data-original")
        # The cover id is the second-to-last path segment of the cover image URL.
        cover_id = cover_url.split("/")[-2]
        DATA["cover"] = f"/api/web_scrap/get_cover/colamanga/{id}/{cover_id}/"
        content_info_element = driver.find_element(By.CLASS_NAME, "fed-deta-content")
        DATA["title"] = content_info_element.find_element(By.TAG_NAME, "h1").text
        li_info_elements = content_info_element.find_element(By.TAG_NAME, "ul").find_elements(By.CLASS_NAME, "fed-col-md6")
        DATA["status"] = li_info_elements[0].find_element(By.TAG_NAME, "a").text
        DATA["author"] = li_info_elements[1].find_element(By.TAG_NAME, "a").text
        DATA["updated"] = li_info_elements[2].find_element(By.TAG_NAME, "a").text
        DATA["category"] = [a.text for a in li_info_elements[4].find_elements(By.TAG_NAME, "a")]
        DATA["synopsis"] = driver.find_element(By.CLASS_NAME, "fed-tabs-boxs").find_element(By.CSS_SELECTOR, "p.fed-text-muted").get_attribute("innerHTML")
        # Grab the chapter list's inner HTML once and parse it with BeautifulSoup,
        # which avoids one Selenium round-trip per <li>.
        ul_element = BeautifulSoup(driver.find_element(By.CLASS_NAME, "all_data_list").find_element(By.TAG_NAME, "ul").get_attribute("innerHTML"), "html.parser")
        chapter_array = []
        for li in ul_element.find_all("li"):
            a_element = li.find("a")
            href = a_element.get("href")
            chapter_array.append({
                "idx": int(href.split("/")[-1].split(".")[0]),  # numeric chapter index from the href's file name
                "title": a_element.get("title"),
                "id": href.lstrip("/"),
            })
        DATA["chapters"] = chapter_array
        return DATA
    except Exception as e:
        # Report the failing line, then re-raise with the original traceback intact.
        _, _, exc_tb = sys.exc_info()
        print(f"Error on line {exc_tb.tb_lineno}: {e}")
        raise
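
# A sketch of the dict scrap() returns (field values below are illustrative
# placeholders, not real scraped data; the chapter "id" is the chapter href
# with its leading slash stripped):
#
# {
#     "id": 1,
#     "cover": "/api/web_scrap/get_cover/colamanga/1/<cover_id>/",
#     "title": "<manga title>",
#     "status": "<serialization status>",
#     "author": "<author>",
#     "updated": "<last update date>",
#     "category": ["<tag>", ...],
#     "synopsis": "<description HTML>",
#     "chapters": [{"idx": 1, "title": "<chapter title>", "id": "<href path>"}, ...],
# }
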
if __name__ == "__main__":
    # Quick smoke test against manga id 1.
    DATA = scrap(id=1)
    pprint(DATA)
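
# NOTE: because of the relative import above (`from ..utils import ...`), this
# file cannot be executed directly as `python <file>.py`; run it as a module
# from the project root instead, e.g. `python -m <package_path>.<this_module>`
# (the exact package path depends on where this file lives in the repo).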