# NOTE(review): the lines below are web-page extraction artifacts (hosting-page
# header and a line-number gutter), commented out so the module is importable:
# "Spaces:" / "Runtime error" / "Runtime error" / "File size: 4,299 Bytes"
# / "a1e77ee |" / line-number gutter "1 2 3 ... 127 |"
import argparse
import glob
import os
import shutil
import time
import urllib
import cv2
import mediapipe as mp
import pandas as pd
import requests
from bs4 import BeautifulSoup
from omegaconf import OmegaConf
class Scraper:
    """Scrape person portrait images from hominis.media into a local dataset.

    Downloads one thumbnail image per person, records (filepath, name, url)
    rows in a CSV index, and post-filters the images with MediaPipe FaceMesh
    so that only single-face portraits remain in the data directory.
    """

    def __init__(self, config):
        """Load the YAML config via OmegaConf and (re)open the CSV index.

        Args:
            config: Path to a YAML config file providing ``path_csv``,
                ``path_data`` and ``path_garbage``.
        """
        self.config = OmegaConf.load(config)
        self.base_url = "https://hominis.media/person/"
        if os.path.exists(self.config.path_csv):
            # Resume from a previous run: keep existing rows, append after them.
            self.df = pd.read_csv(self.config.path_csv)
            self.idx = len(self.df)
        else:
            self.df = pd.DataFrame([], columns=["filepath", "name", "url"])
            self.idx = 0
        os.makedirs(self.config.path_data, exist_ok=True)
        os.makedirs(self.config.path_garbage, exist_ok=True)

    def run(self):
        """Crawl every listing page and download each person's thumbnail.

        Skips images already present on disk, sleeps 1s between downloads
        to be polite to the server, and writes the CSV index when done.
        """
        html = requests.get(self.base_url, timeout=5)
        soup = BeautifulSoup(html.content, "html.parser")
        pages = soup.find_all("input", class_="selectButton")
        before = 0
        for page in pages:
            # The onclick handler embeds the relative page URL in single quotes.
            url = self.base_url + page.get("onclick").split("'")[1].replace(
                "/person/", ""
            )
            html = requests.get(url, timeout=5)
            soup = BeautifulSoup(html.content, "html.parser")
            people = soup.find_all("li", class_="card people")
            for person in people:
                name = person.find("p", class_="name").text
                # The thumbnail URL is embedded in an inline CSS style rule.
                img_url = (
                    person.find("p", class_="thumbnail")
                    .get("style")
                    .replace("background-image:url('", "")
                    .replace("');", "")
                )
                img_path = os.path.join(self.config.path_data, name + ".png")
                if os.path.exists(img_path):
                    continue
                try:
                    # BUGFIX: the original called urllib.request.urlretrieve,
                    # but the file only does ``import urllib`` — the
                    # ``urllib.request`` submodule was never imported. Use
                    # requests, which is already a dependency of this module.
                    resp = requests.get(img_url, timeout=5)
                    resp.raise_for_status()
                    with open(img_path, "wb") as f:
                        f.write(resp.content)
                    self.df.loc[self.idx] = {
                        "filepath": img_path,
                        "name": name,
                        "url": img_url,
                    }
                    self.idx += 1
                    time.sleep(1)  # rate-limit between downloads
                except Exception:
                    # Best-effort scraping: skip any person whose image cannot
                    # be fetched, and remove a partially written file so it is
                    # not mistaken for a completed download on the next run.
                    if os.path.exists(img_path):
                        os.remove(img_path)
                    continue
            imgs = glob.glob(os.path.join(self.config.path_data, "*.png"))
            assert len(imgs) == len(self.df)
            print(f"Get {len(imgs) - before} images")
            before = len(imgs)
        self.df.to_csv(self.config.path_csv, index=False)

    def post_processing(self):
        """Move non-portrait images to the garbage dir and prune the CSV.

        An image is kept only when MediaPipe FaceMesh detects exactly one
        face in it; unreadable files are discarded as well.
        """
        mp_face_mesh = mp.solutions.face_mesh
        with mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=10,
            refine_landmarks=True,
            min_detection_confidence=0.5,
        ) as face_mesh:
            for file in glob.glob(os.path.join(self.config.path_data, "*.png")):
                image = cv2.imread(file)
                # BUGFIX: the original evaluated
                # ``len(results.multi_face_landmarks)`` even when the value
                # was None (no face detected), raising TypeError right after
                # moving the file. Guard unreadable images too: cv2.imread
                # returns None for corrupt files, which would crash cvtColor.
                if image is None:
                    n_faces = 0
                else:
                    results = face_mesh.process(
                        cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    )
                    n_faces = (
                        len(results.multi_face_landmarks)
                        if results.multi_face_landmarks
                        else 0
                    )
                if n_faces != 1:
                    shutil.move(
                        file,
                        os.path.join(
                            self.config.path_garbage, os.path.split(file)[-1]
                        ),
                    )
        # Drop the CSV rows whose image was moved to the garbage directory.
        idx = []
        for path in glob.glob(os.path.join(self.config.path_garbage, "*.png")):
            idx.append(
                self.df[
                    self.df["filepath"]
                    == os.path.join(self.config.path_data, os.path.split(path)[-1])
                ].index.values[0]
            )
        self.df = self.df.drop(idx)
        assert len(glob.glob(os.path.join(self.config.path_data, "*.png"))) == len(
            self.df
        )
        self.df.to_csv(self.config.path_csv, index=False)
def argparser():
    """Parse command-line arguments for the scraper entry point.

    Returns:
        argparse.Namespace with a single ``config`` attribute: the path to
        a YAML config file (default ``config.yaml``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        default="config.yaml",
        type=str,
        help="File path for config file.",
    )
    return parser.parse_args()
if __name__ == "__main__":
    # Scrape all portraits, then filter out non-single-face images.
    cli_args = argparser()
    app = Scraper(cli_args.config)
    app.run()
    app.post_processing()
# (end-of-page extraction artifact removed: "|")