from bs4 import BeautifulSoup
from pprint import pprint
from ..utils import SeleniumScraper
from selenium.webdriver.common.by import By
from core.settings import BASE_DIR
from PIL import Image
from backend.module.utils import date_utils
import json, time, threading, os, uuid, sqlite3, io, base64, sys

MAX_TIMEOUT = 10
scraper = None
def __scrollToBottom(driver:object=None):
    if not driver: raise ValueError("The 'driver' argument is required.")
    timeout = date_utils.utc_time().add(60,'second').get()
    previous_height = 0
    scrolledY = 0
    while True:
        if date_utils.utc_time().get() >= timeout: raise Exception("[Get Chapter] Finding latest element timed out!")
        # Scroll down the page. The offset grows on each pass, so the viewport
        # keeps advancing even as lazy-loaded content extends the document.
        driver.execute_script(f"window.scrollBy(0, {scrolledY});")
        current_height = driver.execute_script("return document.documentElement.scrollHeight")
        if current_height > previous_height:
            previous_height = current_height
        else:
            # Height stopped growing: stop once the list's last child is real
            # content rather than the '__cad.read_periodical();' loader script.
            parent_div = driver.find_element(By.CLASS_NAME, "mh_mangalist")
            child_elements = parent_div.find_elements(By.XPATH, "./*")
            if child_elements[-1].get_attribute('text') != '__cad.read_periodical();': break
        scrolledY += 50
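
# NOTE: scrap() below reads Chrome's DevTools performance log to recover image
# bytes served from blob: URLs. That only works when the driver is started with
# performance logging enabled. A minimal sketch of the setup this module assumes
# SeleniumScraper (from ..utils, not shown here) provides:
#
#     from selenium import webdriver
#     options = webdriver.ChromeOptions()
#     options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
#     driver = webdriver.Chrome(options=options)
#
# Without "goog:loggingPrefs", driver.get_log('performance') yields no
# Network.* events and no pages can be saved.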
def scrap(comic_id:str="", chapter_id:str="", output_dir:str=""):
    if not comic_id: raise ValueError("The 'comic_id' parameter is required.")
    if not chapter_id: raise ValueError("The 'chapter_id' parameter is required.")
    if not output_dir: raise ValueError("The 'output_dir' parameter is required.")
    global scraper
    try:
        url = f"https://www.colamanga.com/{chapter_id}"
        if not scraper: scraper = SeleniumScraper()
        driver = scraper.driver()
        driver.get(url)
        __scrollToBottom(driver=driver)
        parent_element = driver.find_element(By.ID, "mangalist")
        child_list = parent_element.find_elements(By.CLASS_NAME, "mh_comicpic")
        blob_list = []
        for child in child_list:
            # Give each lazy-loaded image up to 5 seconds to resolve a src
            # and finish loading before moving on to the next one.
            timeout = date_utils.utc_time().add(5,'second').get()
            while True:
                if date_utils.utc_time().get() > timeout: break
                image_element = child.find_element(By.TAG_NAME, "img")
                driver.execute_script("arguments[0].scrollIntoView({ behavior: 'smooth', block: 'center' });", image_element)
                url = image_element.get_attribute("src")
                if url:
                    # 'complete' is true once the browser has finished fetching the image.
                    is_image_loaded = driver.execute_script(
                        "return arguments[0].complete",
                        image_element
                    )
                    if is_image_loaded:
                        blob_list.append(url)
                        break
        def process_browser_log_entry(entry):
            response = json.loads(entry['message'])['message']
            return response
        # Pull Network.* events from the DevTools performance log and keep
        # only image responses that were served from blob: URLs.
        browser_log = driver.get_log('performance')
        events = [process_browser_log_entry(entry) for entry in browser_log]
        events = [event for event in events if 'Network.response' in event['method']]
        chapter_id = chapter_id.split("/")[-1].split(".")[0]
        save_dir = os.path.join(output_dir)
        os.makedirs(save_dir, exist_ok=True)
        for e in events:
            if e.get("params").get("type") == "Image":
                url = e.get("params").get("response").get("url")
                # Skip blob responses that do not match a collected page image.
                if url.split(":")[0] == "blob" and url in blob_list:
                    request_id = e["params"]["requestId"]
                    # The body of a blob image response comes back base64-encoded.
                    response = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
                    img = Image.open(io.BytesIO(base64.decodebytes(bytes(response.get("body"), "utf-8"))))
                    # Name each page by its position in the scroll order.
                    img.save(os.path.join(save_dir, f"{blob_list.index(url)}.png"))
        return {"status":"success"}
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        line_number = exc_tb.tb_lineno
        print(f"Error on line {line_number}: {e}")
        # Re-raise as-is to preserve the original traceback.
        raise
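
# Sketch of the CDP response shape assumed above (an assumption about the
# protocol, not taken from this repo): Network.getResponseBody returns
# {"body": str, "base64Encoded": bool}, and binary bodies such as images
# arrive base64-encoded. A more defensive decode would be:
#
#     response = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
#     raw = base64.b64decode(response['body']) if response.get('base64Encoded') else response['body'].encode('utf-8')
#     img = Image.open(io.BytesIO(raw))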
if __name__ == "__main__":
    # Example invocation. The chapter page path below is hypothetical; pass the
    # path segment of the chapter URL after https://www.colamanga.com/ as chapter_id.
    DATA = scrap(comic_id="manga-gu881388", chapter_id="manga-gu881388/1/334.html", output_dir="./output")
    # with open("./temp.html","w", encoding='utf-8') as f:
    #     f.write(ul.prettify())  # Write each element prettified
    # pprint(DATA)