Spaces:

hugging2021
/

ragtag4

Runtime error

App Files Files Community

ragtag4 / screenplays /downloader.py

hugging2021

Upload folder using huggingface_hub

79c7b05 verified 2 days ago

raw

history blame contribute delete

2.07 kB

	import os
	import sys
	import requests
	from bs4 import BeautifulSoup
	import argparse

	# Screenplay page fetched when no --url is given on the command line.
	DEFAULT_SCRIPT_URL = "https://www.dailyscript.com/scripts/twelve_monkeys.html"

	# Output folder for downloaded screenplays; created at import time if missing.
	save_dir = "scripts"
	os.makedirs(name=save_dir, exist_ok=True)

	# Function to download a script
	def _strip_line_number(line):
	    """Return ``line`` without a leading ``<digits>|`` gutter, if it has one.

	    Only a purely numeric prefix counts as a gutter, so a line whose text
	    merely contains a ``|`` somewhere is returned unchanged.
	    """
	    gutter, pipe, remainder = line.partition("|")
	    if pipe and gutter.strip().isdigit():
	        return remainder
	    return line


	def download_script(script_url, save_dir, timeout=30):
	    """Download a screenplay page and save its text as a ``.txt`` file.

	    The page is fetched with ``requests`` and parsed with BeautifulSoup. If
	    the page has a ``<pre>`` tag, its text is used, with any leading
	    ``<number>|`` line-number gutter removed from each line; otherwise the
	    text of ``<body>`` is used.

	    Args:
	        script_url: URL of the HTML page holding the screenplay.
	        save_dir: Directory to write the text file into; created if missing.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        The path of the written text file.

	    Raises:
	        requests.RequestException: If the request fails, times out, or the
	            server answers with an HTTP error status.
	        ValueError: If the page has neither a ``<pre>`` nor a ``<body>`` tag.
	    """
	    # Without a timeout, requests can wait indefinitely on a silent server.
	    response = requests.get(script_url, timeout=timeout)
	    response.raise_for_status()

	    soup = BeautifulSoup(response.text, "html.parser")

	    pre_tag = soup.find("pre")
	    if pre_tag is not None:
	        lines = pre_tag.get_text().split("\n")
	        cleaned_text = "\n".join(_strip_line_number(line) for line in lines)
	    else:
	        # No <pre> block: fall back to the whole page body.
	        body = soup.find("body")
	        if body is None:
	            raise ValueError("Could not find script content in the HTML")
	        cleaned_text = body.get_text()

	    # Name the file after the last URL segment, cut at its first dot. A URL
	    # ending in "/" has an empty last segment, so fall back to a fixed stem
	    # rather than writing a file called just ".txt".
	    stem = os.path.basename(script_url).split(".")[0] or "script"
	    save_path = os.path.join(save_dir, stem + ".txt")

	    # Callers may pass a directory other than the one created at import time.
	    os.makedirs(save_dir, exist_ok=True)
	    with open(save_path, "w", encoding="utf-8") as file:
	        file.write(cleaned_text)

	    print(f"Downloaded: {save_path}")
	    return save_path

	if __name__ == "__main__":
	    # Command-line entry point: download one screenplay into save_dir.
	    parser = argparse.ArgumentParser(description="Download a screenplay from a given URL.")
	    parser.add_argument(
	        "--url",
	        type=str,
	        default=DEFAULT_SCRIPT_URL,
	        help="URL of the screenplay to download (default: Twelve Monkeys)",
	    )
	    args = parser.parse_args()

	    # download_script derives the output filename from the URL itself, so
	    # nothing needs to be precomputed here.
	    try:
	        download_script(args.url, save_dir)
	    except (requests.RequestException, ValueError) as err:
	        # sys.exit with a string prints it to stderr and exits with status 1.
	        sys.exit(f"Failed to download {args.url}: {err}")