# NOTE(review): removed non-Python page-capture residue ("Spaces:", "Sleeping",
# file-size line, commit hash, line-number gutter) that was a syntax error here.
from bs4 import BeautifulSoup
from pprint import pprint
from ..utils import SeleniumScraper
from selenium.webdriver.common.by import By
from core.settings import BASE_DIR
from PIL import Image
from backend.module.utils import date_utils
import json,time, threading,os, uuid, sqlite3, io, base64, sys
# Seconds to wait for a single blob image to finish loading before aborting.
MAX_TIMEOUT = 10
# Module-level SeleniumScraper singleton, lazily created on first scrap() call.
scraper = None
def __scrollToBottom(driver:object=None):
    """Scroll the chapter page downward until lazy-loaded content stops growing.

    Repeatedly scrolls the window so the site's lazy loader fetches every
    page image.  The loop ends once the document height has stopped growing,
    the trailing lazy-load <script> marker ('__cad.read_periodical();') is no
    longer the last child of the list container, and `timeout` seconds of
    idleness have elapsed.

    Args:
        driver: a Selenium WebDriver instance (required).

    Raises:
        ValueError: if `driver` is not supplied.
    """
    if not driver: raise ValueError("The 'driver' argument is required.")
    timeout = 10  # seconds of no height growth tolerated before giving up
    # BUGFIX(review): the original used a daemon thread that incremented a
    # shared counter and exited permanently once the timeout was first
    # reached; if the main loop later reset the counter (the else-branch
    # below), it could never reach the timeout again and the loop hung
    # forever.  A monotonic idle timestamp has the same semantics without
    # the thread or the hang.
    idle_since = time.monotonic()
    previous_height = 0
    scrolledY = 0
    while True:
        # Scroll further down the page, then give content a moment to load.
        driver.execute_script(f"window.scrollBy(0, {scrolledY});")
        time.sleep(0.25)
        current_height = driver.execute_script("return document.documentElement.scrollHeight")
        if current_height > previous_height:
            previous_height = current_height
            idle_since = time.monotonic()  # page still growing: reset idle timer
        else:
            parent_div = driver.find_element(By.CLASS_NAME, "mh_mangalist")
            child_elements = parent_div.find_elements(By.XPATH, "./*")
            last = child_elements[-1]
            # While more content is pending the site keeps a <script> tag
            # invoking __cad.read_periodical() as the last child; only allow
            # the timeout to expire once that marker is gone.
            if last.tag_name != 'script' and last.get_attribute('text') != '__cad.read_periodical();':
                if time.monotonic() - idle_since >= timeout: break
            else:
                idle_since = time.monotonic()
        scrolledY += 50
def scrap(comic_id:str="",chapter_id:str="",output_dir:str=""):
    """Scrape every page image of one chapter from colamanga.com.

    Loads https://www.colamanga.com/<chapter_id>, scrolls to the bottom so
    all lazily-loaded images are fetched, then recovers each blob-URL image
    body from the Chrome performance log via CDP (Network.getResponseBody)
    and saves it to `output_dir` as <index>.png, where <index> is the
    image's position within the page.

    Args:
        comic_id: comic identifier (required; currently only validated).
        chapter_id: chapter path component appended to the site URL (required).
        output_dir: directory the PNG files are written to; created if absent.

    Returns:
        {"status": "success"} on completion.

    Raises:
        ValueError: when any required argument is missing.
        Exception: '#1 Timed out!' when a blob image never finishes loading;
            any underlying Selenium/PIL error is re-raised with its traceback.
    """
    if not comic_id: raise ValueError("The 'comic_id' parameter is required.")
    if not chapter_id: raise ValueError("The 'chapter_id' parameter is required.")
    if not output_dir: raise ValueError("The 'output_dir' parameter is required.")
    global scraper
    try:
        url = f"https://www.colamanga.com/{chapter_id}"
        # Lazily create one shared browser session and reuse it across calls.
        if not scraper: scraper = SeleniumScraper()
        driver = scraper.driver()
        driver.get(url)
        __scrollToBottom(driver=driver)
        parent_element = driver.find_element(By.ID, "mangalist")
        child_list = parent_element.find_elements(By.CLASS_NAME, "mh_comicpic")
        # Ordered blob URLs; an image's position here decides its file name.
        blob_list = []
        for child in child_list:
            image_element = child.find_element(By.TAG_NAME, "img")
            url = image_element.get_attribute("src")
            if not url: continue
            if url.startswith("blob:"):
                # Wait (up to MAX_TIMEOUT seconds) for the browser to finish
                # decoding the image; its body is unavailable via CDP before then.
                timeout = 0
                while True:
                    if timeout >= MAX_TIMEOUT: raise Exception('#1 Timed out!')
                    is_image_loaded = driver.execute_script(
                        "return arguments[0].complete",
                        image_element
                    )
                    if is_image_loaded: break
                    timeout += 1
                    time.sleep(1)
                blob_list.append(url)
        def process_browser_log_entry(entry):
            # Each performance-log entry wraps a CDP event as JSON text.
            return json.loads(entry['message'])['message']
        browser_log = driver.get_log('performance')
        events = [process_browser_log_entry(entry) for entry in browser_log]
        events = [event for event in events if 'Network.response' in event['method']]
        # Create the output directory once, not per response event.
        os.makedirs(output_dir, exist_ok=True)
        for e in events:
            if e.get("params").get("type") == "Image":
                url = e.get("params").get("response").get("url")
                if url.startswith("blob:"):
                    request_id = e["params"]["requestId"]
                    # CDP returns binary bodies base64-encoded.
                    response = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
                    img = Image.open(io.BytesIO(base64.decodebytes(bytes(response.get("body"), "utf-8"))))
                    # NOTE(review): index() returns the FIRST match, so duplicate
                    # blob URLs would overwrite one file — confirm URLs are unique.
                    img.save(os.path.join(output_dir, f"{blob_list.index(url)}.png"))
        return {"status":"success"}
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        print(f"Error on line {exc_tb.tb_lineno}: {e}")
        # Re-raise the original exception so the real type and traceback are
        # preserved (the original 'raise Exception(e)' discarded both).
        raise
if __name__ == "__main__":
    # BUGFIX(review): the original call passed an unsupported 'chapter' keyword
    # and omitted the required 'comic_id'/'output_dir' arguments, so it failed
    # with TypeError before any scraping began.  All three required parameters
    # are supplied here.
    # TODO(review): confirm the real chapter-path format against the site.
    DATA = scrap(
        comic_id="manga-gu881388",
        chapter_id="manga-gu881388/1/334.html",
        output_dir="./output",
    )
    pprint(DATA)