"""ttts/spider/xmly_spider.py

Scrape album links from ximalaya.com category pages with Selenium and
download each album's audio through the external xmlyfetcher CLI.
"""
import subprocess
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

def get_page_content(url, driver):
    driver.get(url)
    # Ximalaya renders listings with JavaScript; wait briefly before reading.
    time.sleep(2)
    html_content = driver.page_source
    return html_content
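
# A fixed sleep is simple but brittle. Below is a minimal explicit-wait
# sketch; the CSS selector is an assumption about Ximalaya's markup,
# mirroring the class queried in parse_album_links below.
def get_page_content_waiting(url, driver, timeout=10):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver.get(url)
    # Block until at least one album-cover link is present, or time out.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'a.album-cover'))
    )
    return driver.page_source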

def parse_album_links(content):
    """Extract the href of every album cover on a category listing page."""
    soup = BeautifulSoup(content, 'html.parser')
    album_elements = soup.find_all('a', class_='album-cover')
    album_links = [element['href'] for element in album_elements]
    return album_links
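
# For reference (an assumption about Ximalaya's listing markup, inferred from
# the selector above): an entry such as
#   <a class="album-cover" href="/album/12345">...</a>
# yields '/album/12345', whose final path segment is the album id used later.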

def get_all_album_links(base_url, num_pages):
    """Crawl up to num_pages listing pages from base_url, collecting album links."""
    service = Service(executable_path=ChromeDriverManager().install())
    op = webdriver.ChromeOptions()
    # op.add_argument('headless')
    op.add_argument("--log-level=3")
    driver = webdriver.Chrome(options=op, service=service)
    album_links = []
    page_url = base_url
    page_count = 1
    # <= so exactly num_pages pages are visited; the original < stopped one early.
    while page_url and page_count <= num_pages:
        content = get_page_content(page_url, driver)
        links = parse_album_links(content)
        album_links.extend(links)
        next_page_link = parse_next_page_link(content, page_count + 1)
        if next_page_link:
            page_url = f"https://www.ximalaya.com{next_page_link}"
        else:
            page_url = None
        page_count += 1
    driver.quit()  # release the browser once crawling is done
    return album_links

def parse_next_page_link(content, page_count):
    """Return the href of the pagination link labelled page_count, or None."""
    soup = BeautifulSoup(content, 'html.parser')
    # bs4 deprecated the text= keyword in favour of string=.
    next_page_element = soup.find('a', class_='page-link', string=str(page_count))
    if next_page_element:
        return next_page_element['href']
    return None

def download_album_audio(album_id):
    """Fetch all tracks of one album via the external xmlyfetcher CLI."""
    out_path = 'ttts/spider/xmly'
    command = f'xmlyfetcher -o {out_path} {album_id} all'
    subprocess.run(command, shell=True)
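
# Passing an argument list avoids the shell entirely, so an odd album_id can
# never be interpreted as shell syntax. A minimal sketch assuming the same
# xmlyfetcher invocation as above:
def download_album_audio_noshell(album_id, out_path='ttts/spider/xmly'):
    subprocess.run(['xmlyfetcher', '-o', out_path, str(album_id), 'all'])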

def download_all_albums(album_links):
    """Download every album; the album id is the last path segment of its link."""
    for link in tqdm(album_links):
        tqdm.write(link)  # tqdm-safe print that keeps the progress bar intact
        album_id = link.split('/')[-1]
        download_album_audio(album_id)

def save_links_to_file(links, file_path):
    """Write one link per line."""
    with open(file_path, 'w') as file:
        for link in links:
            file.write(link + '\n')

def read_links_from_file(file_path):
    """Read one link per line, stripping trailing newlines."""
    links = []
    with open(file_path, 'r') as file:
        for line in file:
            link = line.strip()  # strip the trailing newline
            links.append(link)
    return links

if __name__ == '__main__':
    # First pass (run once, then reuse the saved link list):
    # base_url = 'https://www.ximalaya.com/category/a1001'
    # num_pages = 50
    # album_links = get_all_album_links(base_url, num_pages)
    # save_links_to_file(album_links, 'album_links.txt')
    album_links = read_links_from_file('ttts/spider/album_links.txt')
    download_all_albums(album_links)