""" Fetch Markdown files from specified GitHub repositories. This script fetches Markdown (.md), MDX (.mdx), and Jupyter Notebook (.ipynb) files from specified GitHub repositories, particularly focusing on documentation sources for various AI and machine learning libraries. Key features: 1. Configurable for multiple documentation sources (e.g., Hugging Face Transformers, PEFT, TRL) 2. Command-line interface for specifying one or more sources to process 3. Automatic conversion of Jupyter Notebooks to Markdown 4. Rate limiting handling to comply with GitHub API restrictions 5. Retry mechanism for resilience against network issues Usage: python github_to_markdown_ai_docs.py [ ...] Where is one of the predefined sources in SOURCE_CONFIGS (e.g., 'transformers', 'peft', 'trl'). Example: python github_to_markdown_ai_docs.py trl peft This will download and process the documentation files for both TRL and PEFT libraries. Note: - Ensure you have set the GITHUB_TOKEN variable with your GitHub Personal Access Token. - The script creates a 'data' directory in the current working directory to store the downloaded files. - Each source's files are stored in a subdirectory named '_md_files'. """ import argparse import json import os import random import time from typing import Dict, List import nbformat import requests from dotenv import load_dotenv from nbconvert import MarkdownExporter load_dotenv() # Configuration for different sources SOURCE_CONFIGS = { "transformers": { "owner": "huggingface", "repo": "transformers", "path": "docs/source/en", }, "peft": { "owner": "huggingface", "repo": "peft", "path": "docs/source", }, "trl": { "owner": "huggingface", "repo": "trl", "path": "docs/source", }, "llama_index": { "owner": "run-llama", "repo": "llama_index", "path": "docs/docs", }, "openai_cookbooks": { "owner": "openai", "repo": "openai-cookbook", "path": "examples", }, "langchain": { "owner": "langchain-ai", "repo": "langchain", "path": "docs/docs", }, } # GitHub Personal Access Token (replace with your own token) GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") # Headers for authenticated requests HEADERS = { "Authorization": f"token {GITHUB_TOKEN}", "Accept": "application/vnd.github.v3+json", } # Maximum number of retries MAX_RETRIES = 5 def check_rate_limit(): rate_limit_url = "https://api.github.com/rate_limit" response = requests.get(rate_limit_url, headers=HEADERS) data = response.json() remaining = data["resources"]["core"]["remaining"] reset_time = data["resources"]["core"]["reset"] if remaining < 10: # Adjust this threshold as needed wait_time = reset_time - time.time() print(f"Rate limit nearly exceeded. Waiting for {wait_time:.2f} seconds.") time.sleep(wait_time + 1) # Add 1 second buffer def get_files_in_directory(api_url: str, retries: int = 0) -> List[Dict]: try: check_rate_limit() response = requests.get(api_url, headers=HEADERS) response.raise_for_status() return response.json() except requests.exceptions.RequestException as e: if retries < MAX_RETRIES: wait_time = (2**retries) + random.random() print( f"Error fetching directory contents: {e}. Retrying in {wait_time:.2f} seconds..." 
) time.sleep(wait_time) return get_files_in_directory(api_url, retries + 1) else: print( f"Failed to fetch directory contents after {MAX_RETRIES} retries: {e}" ) return [] def download_file(file_url: str, file_path: str, retries: int = 0): try: check_rate_limit() response = requests.get(file_url, headers=HEADERS) response.raise_for_status() with open(file_path, "wb") as file: file.write(response.content) except requests.exceptions.RequestException as e: if retries < MAX_RETRIES: wait_time = (2**retries) + random.random() print( f"Error downloading file: {e}. Retrying in {wait_time:.2f} seconds..." ) time.sleep(wait_time) download_file(file_url, file_path, retries + 1) else: print(f"Failed to download file after {MAX_RETRIES} retries: {e}") # def convert_ipynb_to_md(ipynb_path: str, md_path: str): # with open(ipynb_path, "r", encoding="utf-8") as f: # notebook = nbformat.read(f, as_version=4) # exporter = MarkdownExporter() # markdown, _ = exporter.from_notebook_node(notebook) # with open(md_path, "w", encoding="utf-8") as f: # f.write(markdown) def convert_ipynb_to_md(ipynb_path: str, md_path: str): try: with open(ipynb_path, "r", encoding="utf-8") as f: notebook = nbformat.read(f, as_version=4) exporter = MarkdownExporter() markdown, _ = exporter.from_notebook_node(notebook) with open(md_path, "w", encoding="utf-8") as f: f.write(markdown) except (json.JSONDecodeError, nbformat.reader.NotJSONError) as e: print(f"Error converting notebook {ipynb_path}: {str(e)}") print("Skipping this file and continuing with others...") except Exception as e: print(f"Unexpected error converting notebook {ipynb_path}: {str(e)}") print("Skipping this file and continuing with others...") def fetch_files(api_url: str, local_dir: str): files = get_files_in_directory(api_url) for file in files: if file["type"] == "file" and file["name"].endswith((".md", ".mdx", ".ipynb")): file_url = file["download_url"] file_name = file["name"] file_path = os.path.join(local_dir, file_name) print(f"Downloading {file_name}...") download_file(file_url, file_path) if file_name.endswith(".ipynb"): md_file_name = file_name.replace(".ipynb", ".md") md_file_path = os.path.join(local_dir, md_file_name) print(f"Converting {file_name} to markdown...") convert_ipynb_to_md(file_path, md_file_path) os.remove(file_path) # Remove the .ipynb file after conversion elif file["type"] == "dir": subdir = os.path.join(local_dir, file["name"]) os.makedirs(subdir, exist_ok=True) fetch_files(file["url"], subdir) def process_source(source: str): if source not in SOURCE_CONFIGS: print( f"Error: Unknown source '{source}'. Available sources: {', '.join(SOURCE_CONFIGS.keys())}" ) return config = SOURCE_CONFIGS[source] api_url = f"https://api.github.com/repos/{config['owner']}/{config['repo']}/contents/{config['path']}" local_dir = f"data/{config['repo']}_md_files" os.makedirs(local_dir, exist_ok=True) print(f"Processing source: {source}") fetch_files(api_url, local_dir) print(f"Finished processing {source}") def main(sources: List[str]): for source in sources: process_source(source) print("All specified sources have been processed.") if __name__ == "__main__": parser = argparse.ArgumentParser( description="Fetch Markdown files from specified GitHub repositories." ) parser.add_argument( "sources", nargs="+", choices=SOURCE_CONFIGS.keys(), help="Specify one or more sources to process", ) args = parser.parse_args() main(args.sources)
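
# --- Example: programmatic use (illustrative sketch, not part of the CLI) ---
# Because the command-line entry point is guarded by `if __name__ == "__main__"`,
# the functions above can also be driven from another script. The snippet below
# assumes this file is importable as `github_to_markdown_ai_docs` and that
# GITHUB_TOKEN is set in the environment (e.g., via a .env file); both are
# assumptions about your local setup.
#
#     from github_to_markdown_ai_docs import process_source
#
#     for source in ("trl", "peft"):
#         # Writes into data/trl_md_files and data/peft_md_files, mirroring
#         # what `python github_to_markdown_ai_docs.py trl peft` would do.
#         process_source(source)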