Spaces:
Build error
Build error
| import requests | |
| from bs4 import BeautifulSoup | |
| import subprocess | |
| from selenium import webdriver | |
| from selenium.webdriver.common.keys import Keys | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.chrome.service import Service | |
| from bs4 import BeautifulSoup | |
| import time | |
| from tqdm import tqdm | |
| from webdriver_manager.chrome import ChromeDriverManager | |
def get_page_content(url, driver):
    """Load ``url`` in the given browser driver and return the page HTML.

    Args:
        url: Address handed to ``driver.get``.
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (a Selenium WebDriver when called from this script).

    Returns:
        The value of ``driver.page_source`` read right after the navigation
        call returns.
    """
    driver.get(url)
    return driver.page_source
def parse_album_links(content):
    """Extract album link targets from a page of HTML.

    Args:
        content: HTML markup to parse.

    Returns:
        A list holding the ``href`` value of every ``<a>`` element whose
        class includes ``album-cover`` and which has an ``href`` attribute.
    """
    soup = BeautifulSoup(content, 'html.parser')
    # href=True keeps only anchors that actually carry an href attribute, so
    # a malformed anchor is skipped instead of raising KeyError below.
    anchors = soup.find_all('a', class_='album-cover', href=True)
    return [anchor['href'] for anchor in anchors]
def get_all_album_links(base_url, num_pages):
    """Collect album links from up to ``num_pages`` consecutive listing pages.

    Starts a Chrome session, loads ``base_url``, and then follows the
    numbered pagination links one page at a time, gathering the album links
    found on each page. The browser is always shut down before returning.

    Args:
        base_url: URL of the first listing page.
        num_pages: Maximum number of pages to visit.

    Returns:
        A list of album hrefs from all visited pages, in visit order.
    """
    service = Service(executable_path=ChromeDriverManager().install())
    op = webdriver.ChromeOptions()
    # Headless mode is intentionally left off; enable it with
    # op.add_argument('headless') to run without a visible window.
    op.add_argument("--log-level=3")
    driver = webdriver.Chrome(options=op, service=service)

    album_links = []
    try:
        page_url = base_url
        page_count = 1
        # "<=" so that exactly num_pages pages are visited; the counter
        # starts at 1, and "<" would stop one page short (and visit nothing
        # at all for num_pages == 1).
        while page_url and page_count <= num_pages:
            content = get_page_content(page_url, driver)
            album_links.extend(parse_album_links(content))

            # Pagination links are labelled with the page number, so look
            # for the anchor labelled with the next number.
            next_page_link = parse_next_page_link(content, page_count + 1)
            if next_page_link:
                page_url = f"https://www.ximalaya.com{next_page_link}"
            else:
                page_url = None
            page_count += 1
    finally:
        # Without this the Chrome/chromedriver processes outlive the call.
        driver.quit()
    return album_links
def parse_next_page_link(content, page_count):
    """Find the pagination link whose label is the given page number.

    Args:
        content: HTML markup of the current listing page.
        page_count: Page number to look for; it is converted with ``str``
            and compared against the link's string content.

    Returns:
        The ``href`` of the first ``<a>`` with class ``page-link`` whose text
        equals ``str(page_count)`` and which has an ``href`` attribute, or
        ``None`` when no such link exists.
    """
    soup = BeautifulSoup(content, 'html.parser')
    # ``string=`` is the current name of the deprecated ``text=`` filter;
    # ``href=True`` skips anchors without an href instead of letting the
    # lookup below raise KeyError.
    target = soup.find(
        'a', class_='page-link', string=str(page_count), href=True
    )
    if target is None:
        return None
    return target['href']
def download_album_audio(album_id):
    """Run the ``xmlyfetcher`` command-line tool for one album.

    Executes ``xmlyfetcher -o ttts/spider/xmly <album_id> all`` and waits for
    it to finish. The exit status is not checked, so a failed download does
    not stop the caller.
    NOTE(review): the trailing ``all`` presumably selects every track of the
    album -- confirm against xmlyfetcher's usage text.

    Args:
        album_id: Album identifier, passed to the tool as a single argument.

    Raises:
        FileNotFoundError: If the ``xmlyfetcher`` executable cannot be found.
    """
    out_path = 'ttts/spider/xmly'
    # Pass an argument list and skip the shell: album ids come from scraped
    # links / a text file, so interpolating them into a shell string would
    # let spaces or shell metacharacters alter the command.
    command = ['xmlyfetcher', '-o', out_path, str(album_id), 'all']
    subprocess.run(command)
def download_all_albums(album_links):
    """Download each album referenced in ``album_links`` with a progress bar.

    Each link is printed, then its album id is taken as the last non-empty
    ``/``-separated segment and handed to ``download_album_audio``.

    Args:
        album_links: Iterable of album link strings.
    """
    for link in tqdm(album_links):
        print(link)
        # Trim whitespace and trailing slashes first so "/album/123/" still
        # yields "123" rather than an empty id.
        album_id = link.strip().rstrip('/').split('/')[-1]
        if not album_id:
            # Blank entry (e.g. an empty line in the links file): nothing to
            # download, and an empty id would only confuse the fetcher.
            continue
        download_album_audio(album_id)
def save_links_to_file(links, file_path):
    """Write the given links to ``file_path``, one per line.

    The file is opened in text write mode, so any existing content is
    replaced. Every link is followed by a newline character.

    Args:
        links: Iterable of link strings.
        file_path: Destination path.
    """
    with open(file_path, 'w') as out:
        out.writelines(url + '\n' for url in links)
def read_links_from_file(file_path):
    """Read links from a text file that stores one link per line.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of the non-empty lines, in file order, each with leading and
        trailing whitespace (including the line break) removed.
    """
    with open(file_path, 'r') as file:
        # strip() removes the trailing newline along with stray whitespace.
        stripped = (line.strip() for line in file)
        # Blank lines are dropped so callers never receive an empty link,
        # which would otherwise turn into an empty album id downstream.
        return [link for link in stripped if link]
if __name__ == '__main__':
    # The link list is read from a file prepared earlier. To rebuild it by
    # crawling the category listing, run these steps first:
    #   base_url = 'https://www.ximalaya.com/category/a1001'
    #   num_pages = 50
    #   album_links = get_all_album_links(base_url, num_pages)
    #   save_links_to_file(album_links, 'album_links.txt')
    links_path = 'ttts/spider/album_links.txt'
    album_links = read_links_from_file(links_path)
    download_all_albums(album_links)