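"""Scrape portrait images from https://hominis.media/person/, index them in a
CSV file, and keep only the images in which MediaPipe FaceMesh detects exactly
one face; the rest are moved to a garbage directory."""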
import argparse
import glob
import os
import shutil
import time
import urllib.request

import cv2
import mediapipe as mp
import pandas as pd
import requests
from bs4 import BeautifulSoup
from omegaconf import OmegaConf


class Scraper:
    def __init__(self, config):
        self.config = OmegaConf.load(config)
        self.base_url = "https://hominis.media/person/"
        # Resume from an existing CSV index if there is one; otherwise start fresh.
        if os.path.exists(self.config.path_csv):
            self.df = pd.read_csv(self.config.path_csv)
            self.idx = len(self.df)
        else:
            self.df = pd.DataFrame([], columns=["filepath", "name", "url"])
            self.idx = 0
        os.makedirs(self.config.path_data, exist_ok=True)
        os.makedirs(self.config.path_garbage, exist_ok=True)

    def run(self):
        # Walk every listing page linked from the top page and download each
        # person's thumbnail image.
        html = requests.get(self.base_url, timeout=5)
        soup = BeautifulSoup(html.content, "html.parser")
        pages = soup.find_all("input", class_="selectButton")
        before = 0
        for page in pages:
            # The listing-page URL is embedded in the button's onclick attribute.
            url = self.base_url + page.get("onclick").split("'")[1].replace(
                "/person/", ""
            )
            html = requests.get(url, timeout=5)
            soup = BeautifulSoup(html.content, "html.parser")
            people = soup.find_all("li", class_="card people")
            for person in people:
                name = person.find("p", class_="name").text
                # The image URL is embedded in the thumbnail's inline style.
                img_url = (
                    person.find("p", class_="thumbnail")
                    .get("style")
                    .replace("background-image:url('", "")
                    .replace("');", "")
                )
                img_path = os.path.join(self.config.path_data, name + ".png")
                if os.path.exists(img_path):
                    continue
                try:
                    urllib.request.urlretrieve(img_url, img_path)
                    self.df.loc[self.idx] = {
                        "filepath": img_path,
                        "name": name,
                        "url": img_url,
                    }
                    self.idx += 1
                    time.sleep(1)
                except Exception:
                    continue
            imgs = glob.glob(os.path.join(self.config.path_data, "*.png"))
            assert len(imgs) == len(self.df)
            print(f"Got {len(imgs) - before} images")
            before = len(imgs)
            # Save the index after each listing page so an interrupted run can resume.
            self.df.to_csv(self.config.path_csv, index=False)

    def post_processing(self):
        # Move images in which MediaPipe FaceMesh detects no face, or more than
        # one face, to the garbage directory, then drop their rows from the index.
        mp_face_mesh = mp.solutions.face_mesh
        with mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=10,
            refine_landmarks=True,
            min_detection_confidence=0.5,
        ) as face_mesh:
            for file in glob.glob(os.path.join(self.config.path_data, "*.png")):
                image = cv2.imread(file)
                results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                if not results.multi_face_landmarks:
                    shutil.move(
                        file,
                        os.path.join(self.config.path_garbage, os.path.split(file)[-1]),
                    )
                elif len(results.multi_face_landmarks) > 1:
                    shutil.move(
                        file,
                        os.path.join(self.config.path_garbage, os.path.split(file)[-1]),
                    )
        idx = []
        for path in glob.glob(os.path.join(self.config.path_garbage, "*.png")):
            idx.append(
                self.df[
                    self.df["filepath"]
                    == os.path.join(self.config.path_data, os.path.split(path)[-1])
                ].index.values[0]
            )
        self.df = self.df.drop(idx)
        assert len(glob.glob(os.path.join(self.config.path_data, "*.png"))) == len(
            self.df
        )
        self.df.to_csv(self.config.path_csv, index=False)


def argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        default="config.yaml",
        help="File path for config file.",
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = argparser()
    scraper = Scraper(args.config)
    scraper.run()
    scraper.post_processing()
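

# The OmegaConf config loaded above is expected to define three paths; the key
# names come from the self.config accesses in this script, while the example
# values below are placeholders rather than the author's actual layout:
#
#     path_csv: data/people.csv       # CSV index of the downloaded images
#     path_data: data/images          # directory for kept face images
#     path_garbage: data/garbage      # images with zero or multiple detected faces
#
# Assuming the script is saved as scraper.py (an assumed filename), it would be
# run as:
#
#     python scraper.py --config config.yaml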