Spaces:

BloodyInside
/

ComicMTL

Sleeping

App Files Files Community

ComicMTL / backend /module /web_scrap /ColaManga /get.py

BloodyInside

new

cdc95a6 6 months ago

raw

history blame

3.1 kB

	from bs4 import BeautifulSoup
	from pprint import pprint
	from ..utils import SeleniumScraper
	from core.settings import BASE_DIR
	import os, threading, uuid, time, sqlite3,sys

	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC

	from backend.module.utils import date_utils


	scraper = None

	def scrap(id:int=1):
	if not id: raise ValueError("The 'id' parameter is required.")
	global scraper

	try:
	url = f"https://www.colamanga.com/{id}/"

	if not scraper: scraper = SeleniumScraper()
	driver = scraper.driver()
	driver.get(url)

	DATA = {}

	DATA["id"] = id

	# Get info

	cover_url = driver.find_element(By.CLASS_NAME, "fed-list-pics").get_attribute("data-original")
	cover_url_split = cover_url.split("/")
	cover_id = cover_url_split[len(cover_url_split)-2]
	DATA["cover"] = f"/api/web_scrap/get_cover/colamanga/{id}/{cover_id}/"


	content_info_element = driver.find_element(By.CLASS_NAME, "fed-deta-content")
	DATA["title"] = content_info_element.find_element(By.TAG_NAME, "h1").text
	li_info_elements = content_info_element.find_element(By.TAG_NAME, "ul").find_elements(By.CLASS_NAME, "fed-col-md6")

	DATA["status"] = li_info_elements[0].find_element(By.TAG_NAME,"a").text
	DATA["author"] = li_info_elements[1].find_element(By.TAG_NAME,"a").text
	DATA["updated"] = li_info_elements[2].find_element(By.TAG_NAME,"a").text

	category_li = li_info_elements[4].find_elements(By.TAG_NAME,"a")
	array = []
	for c in category_li:
	array.append(c.text)
	DATA["category"] = array


	DATA["synopsis"] = driver.find_element(By.CLASS_NAME, "fed-tabs-boxs").find_element(By.CSS_SELECTOR, "p.fed-text-muted").get_attribute('innerHTML')

	ul_element = BeautifulSoup(driver.find_element(By.CLASS_NAME, "all_data_list").find_element(By.TAG_NAME, "ul").get_attribute('innerHTML'), 'html.parser')
	li_elements = ul_element.find_all('li')

	chapter_array = []
	for li in li_elements:
	a_element = li.find('a')
	obj = {
	"idx": int(a_element.get('href').split("/")[-1].split(".")[0]),
	"title": a_element.get('title'),
	"id": a_element.get('href').lstrip("/")
	}
	chapter_array.append(obj)

	DATA["chapters"] = chapter_array


	return DATA
	except Exception as e:
	exc_type, exc_obj, exc_tb = sys.exc_info()
	line_number = exc_tb.tb_lineno
	print(f"Error on line {line_number}: {e}")
	raise Exception(e)
	finally: pass

	if __name__ == "__main__":
	DATA = scrap(page=1,search="妖")


	# with open("./temp.html","w", encoding='utf-8') as f:

	# f.write(ul.prettify()) # Write each element prettified


	# pprint(DATA)