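"""Scrape portrait images from https://hominis.media/person/, index them in a
CSV file, and keep only the images in which MediaPipe FaceMesh detects exactly
one face; the rest are moved to a garbage directory."""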
import argparse
import glob
import os
import shutil
import time
import urllib.request

import cv2
import mediapipe as mp
import pandas as pd
import requests
from bs4 import BeautifulSoup
from omegaconf import OmegaConf


class Scraper:
    def __init__(self, config):
        self.config = OmegaConf.load(config)
        self.base_url = "https://hominis.media/person/"
        # Resume from an existing CSV index if there is one; otherwise start fresh.
        if os.path.exists(self.config.path_csv):
            self.df = pd.read_csv(self.config.path_csv)
            self.idx = len(self.df)
        else:
            self.df = pd.DataFrame([], columns=["filepath", "name", "url"])
            self.idx = 0
        os.makedirs(self.config.path_data, exist_ok=True)
        os.makedirs(self.config.path_garbage, exist_ok=True)

    def run(self):
        # Walk every listing page linked from the top page and download each
        # person's thumbnail image.
        html = requests.get(self.base_url, timeout=5)
        soup = BeautifulSoup(html.content, "html.parser")
        pages = soup.find_all("input", class_="selectButton")
        before = 0
        for page in pages:
            # The listing-page URL is embedded in the button's onclick attribute.
            url = self.base_url + page.get("onclick").split("'")[1].replace(
                "/person/", ""
            )
            html = requests.get(url, timeout=5)
            soup = BeautifulSoup(html.content, "html.parser")
            people = soup.find_all("li", class_="card people")
            for person in people:
                name = person.find("p", class_="name").text
                # The image URL is embedded in the thumbnail's inline style.
                img_url = (
                    person.find("p", class_="thumbnail")
                    .get("style")
                    .replace("background-image:url('", "")
                    .replace("');", "")
                )
                img_path = os.path.join(self.config.path_data, name + ".png")
                if os.path.exists(img_path):
                    continue
                try:
                    urllib.request.urlretrieve(img_url, img_path)
                    self.df.loc[self.idx] = {
                        "filepath": img_path,
                        "name": name,
                        "url": img_url,
                    }
                    self.idx += 1
                    time.sleep(1)
                except Exception:
                    continue
            imgs = glob.glob(os.path.join(self.config.path_data, "*.png"))
            assert len(imgs) == len(self.df)
            print(f"Got {len(imgs) - before} images")
            before = len(imgs)
            # Save the index after each listing page so an interrupted run can resume.
            self.df.to_csv(self.config.path_csv, index=False)

    def post_processing(self):
        # Move images in which MediaPipe FaceMesh detects no face, or more than
        # one face, to the garbage directory, then drop their rows from the index.
        mp_face_mesh = mp.solutions.face_mesh
        with mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=10,
            refine_landmarks=True,
            min_detection_confidence=0.5,
        ) as face_mesh:
            for file in glob.glob(os.path.join(self.config.path_data, "*.png")):
                image = cv2.imread(file)
                results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                if not results.multi_face_landmarks:
                    shutil.move(
                        file,
                        os.path.join(self.config.path_garbage, os.path.split(file)[-1]),
                    )
                elif len(results.multi_face_landmarks) > 1:
                    shutil.move(
                        file,
                        os.path.join(self.config.path_garbage, os.path.split(file)[-1]),
                    )
        idx = []
        for path in glob.glob(os.path.join(self.config.path_garbage, "*.png")):
            idx.append(
                self.df[
                    self.df["filepath"]
                    == os.path.join(self.config.path_data, os.path.split(path)[-1])
                ].index.values[0]
            )
        self.df = self.df.drop(idx)
        assert len(glob.glob(os.path.join(self.config.path_data, "*.png"))) == len(
            self.df
        )
        self.df.to_csv(self.config.path_csv, index=False)


def argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        default="config.yaml",
        help="File path for config file.",
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = argparser()
    scraper = Scraper(args.config)
    scraper.run()
    scraper.post_processing()
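

# The OmegaConf config loaded above is expected to define three paths; the key
# names come from the self.config accesses in this script, while the example
# values below are placeholders rather than the author's actual layout:
#
#     path_csv: data/people.csv       # CSV index of the downloaded images
#     path_data: data/images          # directory for kept face images
#     path_garbage: data/garbage      # images with zero or multiple detected faces
#
# Assuming the script is saved as scraper.py (an assumed filename), it would be
# run as:
#
#     python scraper.py --config config.yaml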