"""ttts/spider/xmly_spider.py

Scrape album links from ximalaya.com category pages with Selenium and
download each album's audio through the external xmlyfetcher CLI.
"""
import subprocess
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

def get_page_content(url, driver):
    driver.get(url)
    # Ximalaya renders listings with JavaScript; wait briefly before reading.
    time.sleep(2)
    html_content = driver.page_source
    return html_content
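
# A fixed sleep is simple but brittle. Below is a minimal explicit-wait
# sketch; the CSS selector is an assumption about Ximalaya's markup,
# mirroring the class queried in parse_album_links below.
def get_page_content_waiting(url, driver, timeout=10):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver.get(url)
    # Block until at least one album-cover link is present, or time out.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'a.album-cover'))
    )
    return driver.page_source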

def parse_album_links(content):
    """Extract the href of every album cover on a category listing page."""
    soup = BeautifulSoup(content, 'html.parser')
    album_elements = soup.find_all('a', class_='album-cover')
    album_links = [element['href'] for element in album_elements]
    return album_links
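
# For reference (an assumption about Ximalaya's listing markup, inferred from
# the selector above): an entry such as
#   <a class="album-cover" href="/album/12345">...</a>
# yields '/album/12345', whose final path segment is the album id used later.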

def get_all_album_links(base_url, num_pages):
    """Crawl up to num_pages listing pages from base_url, collecting album links."""
    service = Service(executable_path=ChromeDriverManager().install())
    op = webdriver.ChromeOptions()
    # op.add_argument('headless')
    op.add_argument("--log-level=3")
    driver = webdriver.Chrome(options=op, service=service)
    album_links = []
    page_url = base_url
    page_count = 1
    # <= so exactly num_pages pages are visited; the original < stopped one early.
    while page_url and page_count <= num_pages:
        content = get_page_content(page_url, driver)
        links = parse_album_links(content)
        album_links.extend(links)
        next_page_link = parse_next_page_link(content, page_count + 1)
        if next_page_link:
            page_url = f"https://www.ximalaya.com{next_page_link}"
        else:
            page_url = None
        page_count += 1
    driver.quit()  # release the browser once crawling is done
    return album_links

def parse_next_page_link(content, page_count):
    """Return the href of the pagination link labelled page_count, or None."""
    soup = BeautifulSoup(content, 'html.parser')
    # bs4 deprecated the text= keyword in favour of string=.
    next_page_element = soup.find('a', class_='page-link', string=str(page_count))
    if next_page_element:
        return next_page_element['href']
    return None

def download_album_audio(album_id):
    """Fetch all tracks of one album via the external xmlyfetcher CLI."""
    out_path = 'ttts/spider/xmly'
    command = f'xmlyfetcher -o {out_path} {album_id} all'
    subprocess.run(command, shell=True)
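
# Passing an argument list avoids the shell entirely, so an odd album_id can
# never be interpreted as shell syntax. A minimal sketch assuming the same
# xmlyfetcher invocation as above:
def download_album_audio_noshell(album_id, out_path='ttts/spider/xmly'):
    subprocess.run(['xmlyfetcher', '-o', out_path, str(album_id), 'all'])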

def download_all_albums(album_links):
    """Download every album; the album id is the last path segment of its link."""
    for link in tqdm(album_links):
        tqdm.write(link)  # tqdm-safe print that keeps the progress bar intact
        album_id = link.split('/')[-1]
        download_album_audio(album_id)

def save_links_to_file(links, file_path):
    """Write one link per line."""
    with open(file_path, 'w') as file:
        for link in links:
            file.write(link + '\n')

def read_links_from_file(file_path):
    """Read one link per line, stripping trailing newlines."""
    links = []
    with open(file_path, 'r') as file:
        for line in file:
            link = line.strip()  # strip the trailing newline
            links.append(link)
    return links

if __name__ == '__main__':
    # First pass (run once, then reuse the saved link list):
    # base_url = 'https://www.ximalaya.com/category/a1001'
    # num_pages = 50
    # album_links = get_all_album_links(base_url, num_pages)
    # save_links_to_file(album_links, 'album_links.txt')
    album_links = read_links_from_file('ttts/spider/album_links.txt')
    download_all_albums(album_links)