"""Scrape person thumbnails from hominis.media and keep single-face images."""

import argparse
import glob
import os
import shutil
import time
import urllib
import urllib.request

import cv2
import mediapipe as mp
import pandas as pd
import requests
from bs4 import BeautifulSoup
from omegaconf import OmegaConf


class Scraper:
    """Download person thumbnails and maintain a CSV index of them.

    The config file is loaded with OmegaConf and must provide:

    * ``path_csv``: CSV index with columns ``filepath``, ``name``, ``url``.
    * ``path_data``: directory where downloaded ``*.png`` files are stored.
    * ``path_garbage``: directory that receives images rejected by
      :meth:`post_processing`.
    """

    def __init__(self, config):
        """Load the config, restore the CSV index if present, create dirs.

        Args:
            config: Path to the OmegaConf-readable config file.
        """
        self.config = OmegaConf.load(config)
        self.base_url = "https://hominis.media/person/"

        if os.path.exists(self.config.path_csv):
            # Resume from a previous run: new rows are appended after the
            # existing ones.
            self.df = pd.read_csv(self.config.path_csv)
            self.idx = len(self.df)
        else:
            self.df = pd.DataFrame([], columns=["filepath", "name", "url"])
            self.idx = 0

        os.makedirs(self.config.path_data, exist_ok=True)
        os.makedirs(self.config.path_garbage, exist_ok=True)

    def _list_images(self):
        """Return the paths of all ``*.png`` files in the data directory."""
        return glob.glob(os.path.join(self.config.path_data, "*.png"))

    def _download(self, img_url, img_path):
        """Fetch ``img_url`` into ``img_path``.

        Returns:
            True on success, False if the download failed. On failure any
            partially written file is removed so the number of files on disk
            stays in sync with the CSV index.
        """
        try:
            urllib.request.urlretrieve(img_url, img_path)
        except Exception as err:  # best-effort: one bad image must not stop the crawl
            print(f"Failed to download {img_url}: {err}")
            if os.path.exists(img_path):
                os.remove(img_path)
            return False
        return True

    def run(self):
        """Crawl every listing page and download thumbnails not yet on disk.

        For each person card the thumbnail is saved as ``<name>.png`` in
        ``path_data`` and a row is appended to the index. The CSV is written
        after every listing page.

        Raises:
            AssertionError: If the number of ``*.png`` files in ``path_data``
                differs from the number of indexed rows.
        """
        response = requests.get(self.base_url, timeout=5)
        soup = BeautifulSoup(response.content, "html.parser")
        pages = soup.find_all("input", class_="selectButton")

        # Start from the images already on disk so the first progress
        # message only counts what this run downloaded.
        before = len(self._list_images())

        for page in pages:
            # The target path is embedded in the button's onclick handler
            # between single quotes; strip the "/person/" prefix because
            # base_url already ends with it.
            url = self.base_url + page.get("onclick").split("'")[1].replace(
                "/person/", ""
            )
            response = requests.get(url, timeout=5)
            soup = BeautifulSoup(response.content, "html.parser")
            people = soup.find_all("li", class_="card people")

            for person in people:
                name = person.find("p", class_="name").text
                # The image URL lives in the inline CSS of the thumbnail.
                img_url = (
                    person.find("p", class_="thumbnail")
                    .get("style")
                    .replace("background-image:url('", "")
                    .replace("');", "")
                )
                img_path = os.path.join(self.config.path_data, name + ".png")
                if os.path.exists(img_path):
                    continue

                if self._download(img_url, img_path):
                    self.df.loc[self.idx] = {
                        "filepath": img_path,
                        "name": name,
                        "url": img_url,
                    }
                    self.idx += 1
                # Throttle requests to the image host after every attempt.
                time.sleep(1)

            imgs = self._list_images()
            assert len(imgs) == len(self.df)
            print(f"Get {len(imgs) - before} images")
            before = len(imgs)
            self.df.to_csv(self.config.path_csv, index=False)

    def post_processing(self):
        """Move images without exactly one detected face to the garbage dir.

        Each ``*.png`` in ``path_data`` is run through MediaPipe FaceMesh.
        Images that cannot be read, contain no face, or contain more than one
        face are moved to ``path_garbage``. Rows whose file now lives in the
        garbage directory are removed from the index and the CSV is rewritten.

        Raises:
            AssertionError: If the number of ``*.png`` files left in
                ``path_data`` differs from the number of indexed rows.
        """
        mp_face_mesh = mp.solutions.face_mesh
        with mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=10,
            refine_landmarks=True,
            min_detection_confidence=0.5,
        ) as face_mesh:
            for file in self._list_images():
                image = cv2.imread(file)
                if image is None:
                    # cv2.imread returns None for unreadable files; treat
                    # them like images without a face.
                    faces = None
                else:
                    results = face_mesh.process(
                        cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    )
                    faces = results.multi_face_landmarks

                # `faces` is falsy when nothing was detected, so it must be
                # tested before len() is taken.
                if not faces or len(faces) > 1:
                    shutil.move(
                        file,
                        os.path.join(
                            self.config.path_garbage, os.path.basename(file)
                        ),
                    )

        # Map every garbage file back to the path it had in the data dir and
        # drop the matching rows. Garbage files without a row (e.g. leftovers
        # from an earlier run) are simply ignored.
        discarded = [
            os.path.join(self.config.path_data, os.path.basename(path))
            for path in glob.glob(os.path.join(self.config.path_garbage, "*.png"))
        ]
        self.df = self.df[~self.df["filepath"].isin(discarded)]

        assert len(self._list_images()) == len(self.df)
        self.df.to_csv(self.config.path_csv, index=False)


def argparser():
    """Parse command-line arguments.

    Returns:
        The parsed namespace; ``config`` holds the config file path
        (default ``config.yaml``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        default="config.yaml",
        help="File path for config file.",
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = argparser()
    scraper = Scraper(args.config)
    scraper.run()
    scraper.post_processing()