# Source: nomnomnonono — "Upload 8 files" (commit a1e77ee)
import argparse
import glob
import os
import shutil
import time
import urllib
import urllib.request

import cv2
import mediapipe as mp
import pandas as pd
import requests
from bs4 import BeautifulSoup
from omegaconf import OmegaConf
class Scraper:
    """Scrape person portraits from hominis.media, index them in a CSV,
    and discard images that do not contain exactly one detected face."""

    def __init__(self, config):
        """Load the YAML config and open (or create) the CSV index.

        Args:
            config: Path to an OmegaConf YAML file providing ``path_csv``,
                ``path_data`` and ``path_garbage``.
        """
        self.config = OmegaConf.load(config)
        self.base_url = "https://hominis.media/person/"
        # Resume from an existing index if present; otherwise start fresh.
        if os.path.exists(self.config.path_csv):
            self.df = pd.read_csv(self.config.path_csv)
            self.idx = len(self.df)
        else:
            self.df = pd.DataFrame([], columns=["filepath", "name", "url"])
            self.idx = 0
        os.makedirs(self.config.path_data, exist_ok=True)
        os.makedirs(self.config.path_garbage, exist_ok=True)

    def run(self):
        """Crawl every listing page and download each person's thumbnail.

        Appends one row per downloaded image to ``self.df`` and writes the
        CSV index when the crawl finishes.
        """
        html = requests.get(self.base_url, timeout=5)
        soup = BeautifulSoup(html.content, "html.parser")
        pages = soup.find_all("input", class_="selectButton")
        before = 0
        for page in pages:
            # Each paging button carries its target path in an onclick handler.
            url = self.base_url + page.get("onclick").split("'")[1].replace(
                "/person/", ""
            )
            html = requests.get(url, timeout=5)
            soup = BeautifulSoup(html.content, "html.parser")
            people = soup.find_all("li", class_="card people")
            for person in people:
                name = person.find("p", class_="name").text
                # Thumbnail URL is embedded in an inline CSS background-image.
                img_url = (
                    person.find("p", class_="thumbnail")
                    .get("style")
                    .replace("background-image:url('", "")
                    .replace("');", "")
                )
                img_path = os.path.join(self.config.path_data, name + ".png")
                if os.path.exists(img_path):
                    continue  # already downloaded on a previous run
                try:
                    urllib.request.urlretrieve(img_url, img_path)
                    self.df.loc[self.idx] = {
                        "filepath": img_path,
                        "name": name,
                        "url": img_url,
                    }
                    self.idx += 1
                    time.sleep(1)  # be polite to the server
                except Exception:
                    # Best-effort download: skip images that fail to fetch.
                    continue
            imgs = glob.glob(os.path.join(self.config.path_data, "*.png"))
            # Raise (not assert) so the consistency check survives `python -O`.
            if len(imgs) != len(self.df):
                raise RuntimeError(
                    f"Index out of sync: {len(imgs)} files vs {len(self.df)} rows"
                )
            print(f"Get {len(imgs) - before} images")
            before = len(imgs)
        self.df.to_csv(self.config.path_csv, index=False)

    def post_processing(self):
        """Quarantine images without exactly one face and prune the index.

        Bug fix vs. original: when no face was detected, the original fell
        through to ``len(results.multi_face_landmarks)`` with the value
        ``None`` and crashed with a TypeError; the two rejection conditions
        are now combined into a single check.
        """
        mp_face_mesh = mp.solutions.face_mesh
        with mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=10,
            refine_landmarks=True,
            min_detection_confidence=0.5,
        ) as face_mesh:
            for file in glob.glob(os.path.join(self.config.path_data, "*.png")):
                image = cv2.imread(file)
                results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                faces = results.multi_face_landmarks
                # Keep only images with exactly one detected face.
                if not faces or len(faces) > 1:
                    shutil.move(
                        file,
                        os.path.join(
                            self.config.path_garbage, os.path.split(file)[-1]
                        ),
                    )
        idx = []
        for path in glob.glob(os.path.join(self.config.path_garbage, "*.png")):
            # Map the quarantined file back to its original data-dir path.
            matches = self.df[
                self.df["filepath"]
                == os.path.join(self.config.path_data, os.path.split(path)[-1])
            ].index.values
            # Tolerate garbage files that were never recorded in the index
            # (the original raised IndexError on them).
            if len(matches) > 0:
                idx.append(matches[0])
        self.df = self.df.drop(idx)
        if len(glob.glob(os.path.join(self.config.path_data, "*.png"))) != len(
            self.df
        ):
            raise RuntimeError("Index out of sync after post-processing")
        self.df.to_csv(self.config.path_csv, index=False)
def argparser():
    """Parse command-line options for the scraper.

    Returns:
        argparse.Namespace with a single ``config`` attribute: the path to
        the YAML configuration file (defaults to ``config.yaml``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-c",
        "--config",
        type=str,
        default="config.yaml",
        help="File path for config file.",
    )
    return cli.parse_args()
if __name__ == "__main__":
    # Scrape first, then cull images without exactly one detected face.
    cli_args = argparser()
    scraper = Scraper(cli_args.config)
    scraper.run()
    scraper.post_processing()