# Spaces:
# Runtime error
# Runtime error
import argparse
import glob
import os
import shutil
import time
import urllib
import urllib.request

import cv2
import mediapipe as mp
import pandas as pd
import requests
from bs4 import BeautifulSoup
from omegaconf import OmegaConf
class Scraper:
    """Download portrait images from hominis.media and keep single-face ones.

    A CSV index (columns ``filepath``, ``name``, ``url``) is kept in sync
    with the PNG files stored in ``config.path_data``. Images that do not
    contain exactly one detected face are moved to ``config.path_garbage``.
    """

    def __init__(self, config):
        """Load the settings and resume from an existing CSV index if present.

        Args:
            config: Path to a file readable by ``OmegaConf.load``. It must
                provide ``path_csv``, ``path_data`` and ``path_garbage``.
        """
        self.config = OmegaConf.load(config)
        self.base_url = "https://hominis.media/person/"

        if os.path.exists(self.config.path_csv):
            self.df = pd.read_csv(self.config.path_csv)
            # New rows are appended after the rows that were loaded.
            self.idx = len(self.df)
        else:
            self.df = pd.DataFrame([], columns=["filepath", "name", "url"])
            self.idx = 0

        os.makedirs(self.config.path_data, exist_ok=True)
        os.makedirs(self.config.path_garbage, exist_ok=True)

    def run(self):
        """Crawl every listing page and download the images not yet on disk.

        Raises:
            AssertionError: If the number of PNG files in ``path_data`` and
                the number of index rows diverge after a page is processed.
        """
        html = requests.get(self.base_url, timeout=5)
        soup = BeautifulSoup(html.content, "html.parser")
        pages = soup.find_all("input", class_="selectButton")

        before = 0
        for page in pages:
            # The target path is the first single-quoted token of the
            # button's onclick attribute; base_url already ends in /person/.
            page_path = page.get("onclick").split("'")[1]
            url = self.base_url + page_path.replace("/person/", "")
            html = requests.get(url, timeout=5)
            soup = BeautifulSoup(html.content, "html.parser")

            for person in soup.find_all("li", class_="card people"):
                self._download_person(person)

            imgs = glob.glob(os.path.join(self.config.path_data, "*.png"))
            assert len(imgs) == len(self.df), (
                f"{len(imgs)} images on disk but {len(self.df)} index rows"
            )
            print(f"Get {len(imgs) - before} images")
            before = len(imgs)
            # Persist after every page so progress survives an interruption.
            self.df.to_csv(self.config.path_csv, index=False)

    def _download_person(self, person):
        """Download one person's thumbnail and add it to the index.

        Already-downloaded images are skipped. A failed download is reported
        and skipped so that one bad entry does not abort the crawl.
        """
        name = person.find("p", class_="name").text
        img_url = (
            person.find("p", class_="thumbnail")
            .get("style")
            .replace("background-image:url('", "")
            .replace("');", "")
        )
        img_path = os.path.join(self.config.path_data, name + ".png")
        if os.path.exists(img_path):
            return

        try:
            urllib.request.urlretrieve(img_url, img_path)
        except Exception as err:
            # Best-effort crawl: report the failure instead of hiding it, and
            # remove any partial file so the on-disk image count keeps
            # matching the number of index rows.
            print(f"Skip {name}: {err}")
            if os.path.exists(img_path):
                os.remove(img_path)
            return

        self.df.loc[self.idx] = {
            "filepath": img_path,
            "name": name,
            "url": img_url,
        }
        self.idx += 1
        # Pause between downloads to avoid hammering the server.
        time.sleep(1)

    def post_processing(self):
        """Move images without exactly one detected face to the garbage folder.

        Index rows whose image now sits in the garbage folder are dropped and
        the CSV is rewritten.

        Raises:
            AssertionError: If the number of PNG files left in ``path_data``
                differs from the number of remaining index rows.
        """
        mp_face_mesh = mp.solutions.face_mesh
        with mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=10,
            refine_landmarks=True,
            min_detection_confidence=0.5,
        ) as face_mesh:
            for file in glob.glob(os.path.join(self.config.path_data, "*.png")):
                image = cv2.imread(file)
                landmarks = None
                # imread yields None for unreadable files; such a file is
                # treated like an image without a face.
                if image is not None:
                    results = face_mesh.process(
                        cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    )
                    landmarks = results.multi_face_landmarks
                if not self._has_single_face(landmarks):
                    shutil.move(
                        file,
                        os.path.join(
                            self.config.path_garbage, os.path.basename(file)
                        ),
                    )

        self._drop_garbage_rows()
        remaining = glob.glob(os.path.join(self.config.path_data, "*.png"))
        assert len(remaining) == len(self.df), (
            f"{len(remaining)} images on disk but {len(self.df)} index rows"
        )
        self.df.to_csv(self.config.path_csv, index=False)

    @staticmethod
    def _has_single_face(landmarks):
        """Return True only when exactly one face was detected.

        ``landmarks`` may be ``None`` or empty (no detection); both count as
        "no face" instead of raising on ``len(None)``.
        """
        return bool(landmarks) and len(landmarks) == 1

    def _drop_garbage_rows(self):
        """Drop the index rows whose image file is in the garbage folder.

        Garbage files without a matching row (for example leftovers from an
        earlier run) are ignored.
        """
        garbage_files = glob.glob(os.path.join(self.config.path_garbage, "*.png"))
        discarded = {
            os.path.join(self.config.path_data, os.path.basename(path))
            for path in garbage_files
        }
        self.df = self.df[~self.df["filepath"].isin(discarded)]
def argparser(argv=None):
    """Parse the command-line options of the scraper.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read the process arguments.

    Returns:
        argparse.Namespace whose ``config`` attribute holds the config file
        path (``"config.yaml"`` when the option is not given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        default="config.yaml",
        help="File path for config file.",
    )
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: read the config path from the command line, crawl
    # the site, then filter the downloaded images.
    args = argparser()
    scraper = Scraper(args.config)
    scraper.run()
    scraper.post_processing()