# Hugging Face Space: dwb2023/hf_extractor
# Space status when captured: Runtime error
# File size: 2,978 bytes

import os
import shutil
import subprocess

import gradio as gr

def clone_repo(url, repo_dir):
    """Clone the Git repository at ``url`` into ``repo_dir``.

    The clone runs with ``GIT_LFS_SKIP_SMUDGE=1`` in its environment, which
    tells Git LFS not to download large-file contents during checkout.

    Args:
        url: Repository location handed to ``git clone``. It is treated as
            untrusted input.
        repo_dir: Destination directory for the clone.

    Returns:
        ``(True, None)`` when git exits with status 0. Otherwise
        ``(False, message)``, where ``message`` is git's stderr output or a
        description of why the git process could not be started.
    """
    env = os.environ.copy()
    env["GIT_LFS_SKIP_SMUDGE"] = "1"

    # "--" ends option parsing, so a url that begins with "-" is taken as a
    # repository name instead of being interpreted as a git option.
    command = ["git", "clone", "--", url, repo_dir]
    try:
        result = subprocess.run(command, env=env, capture_output=True, text=True)
    except (OSError, ValueError) as exc:
        # OSError: git is not installed / not executable.
        # ValueError: an argument contains an embedded NUL byte.
        return False, f"Failed to run git: {exc}"

    if result.returncode != 0:
        return False, result.stderr
    return True, None

def get_file_summary(file_path):
    """Build the header dict (name, type, size) describing one file.

    A file is reported as ``"binary"`` when it is larger than 1 MiB or when
    a NUL byte appears in its first 8 KiB; otherwise it is ``"text"``.
    Checking the leading bytes keeps small binary files (images, archives,
    compiled objects) from being labelled as text purely because of size.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A dict with keys ``"name"`` (path relative to the current working
        directory), ``"type"`` (``"text"`` or ``"binary"``) and ``"size"``
        (size in bytes).

    Raises:
        OSError: If the file cannot be stat'ed or opened.
    """
    size = os.path.getsize(file_path)
    if size > 1024 * 1024:
        file_type = "binary"
    else:
        # Only the head of the file is sampled so the check stays cheap.
        with open(file_path, "rb") as handle:
            head = handle.read(8192)
        file_type = "binary" if b"\x00" in head else "text"
    return {
        "name": os.path.relpath(file_path),
        "type": file_type,
        "size": size,
    }

def read_file_content(file_path):
    """Return the entire contents of ``file_path`` decoded as UTF-8 text.

    The encoding is fixed to UTF-8 so the result does not depend on the
    locale of the machine running the app.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's text.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

def _build_file_entry(file_path):
    """Return the ``{"header": ..., "content": ...}`` entry for one file."""
    # The repository is untrusted: a symlink could point anywhere on the
    # host (or nowhere), so links are listed but never opened or stat'ed.
    if os.path.islink(file_path):
        return {
            "header": {"name": os.path.relpath(file_path), "type": "symlink", "size": 0},
            "content": "Symbolic link, content not captured.",
        }

    try:
        file_summary = get_file_summary(file_path)
    except OSError as e:
        # One unreadable file should not abort the whole extraction.
        return {
            "header": {"name": os.path.relpath(file_path), "type": "error", "size": 0},
            "content": f"Failed to read file content: {str(e)}",
        }

    entry = {"header": file_summary}
    if file_summary["type"] == "text" and file_summary["size"] <= 1024 * 1024:
        try:
            entry["content"] = read_file_content(file_path)
        except Exception as e:
            # Best effort: report the failure in place of the content.
            entry["content"] = f"Failed to read file content: {str(e)}"
    else:
        entry["content"] = "File too large or binary, content not captured."
    return entry


def extract_repo_content(url):
    """Clone the repository at ``url`` and collect a summary of every file.

    The repository is cloned into ``./temp_repo`` (any previous copy is
    removed first), walked, and removed again before returning. Git's own
    ``.git`` directory is skipped and symbolic links are listed without
    being read.

    Args:
        url: Repository URL passed to ``clone_repo``.

    Returns:
        A list of dicts, each with a ``"header"`` (dict with ``"name"``,
        ``"type"``, ``"size"``) and a ``"content"`` string. If cloning
        fails, the list holds a single entry whose header type is
        ``"error"`` and whose content is the clone error message.
    """
    # NOTE(review): the clone path is fixed, so overlapping calls would
    # share one directory; confirm whether concurrent requests can occur.
    repo_dir = "./temp_repo"
    # Portable replacement for shelling out to `rm -rf`.
    shutil.rmtree(repo_dir, ignore_errors=True)

    try:
        success, error = clone_repo(url, repo_dir)
        if not success:
            return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": error}]

        extracted_content = []
        for root, dirs, files in os.walk(repo_dir):
            # Prune in place so os.walk never descends into git metadata;
            # sorting makes the output order deterministic.
            dirs[:] = sorted(d for d in dirs if d != ".git")
            for file in sorted(files):
                extracted_content.append(_build_file_entry(os.path.join(root, file)))
        return extracted_content
    finally:
        # Do not leave the cloned repository on disk between calls.
        shutil.rmtree(repo_dir, ignore_errors=True)

def format_output(extracted_content):
    """Render extracted file entries as one Markdown-style string.

    Each well-formed entry becomes a section with the file name, type, size
    and its content inside a fenced block. An entry that is not a dict, has
    no dict ``"header"`` with ``"name"``/``"type"``/``"size"``, or has no
    ``"content"`` key is rendered as ``"Error in file data format."``
    instead of raising.

    Args:
        extracted_content: Iterable of entry dicts shaped like
            ``{"header": {"name", "type", "size"}, "content": ...}``.

    Returns:
        The concatenated text for all entries (``""`` for no entries).
    """
    required_header_keys = ("name", "type", "size")
    # Pieces are collected and joined once, avoiding repeated string copies.
    parts = []
    for file_data in extracted_content:
        header = file_data.get("header") if isinstance(file_data, dict) else None
        well_formed = (
            isinstance(header, dict)
            and all(key in header for key in required_header_keys)
            and "content" in file_data
        )
        if not well_formed:
            parts.append("Error in file data format.\n")
            continue
        parts.append(f"### File: {header['name']}\n")
        parts.append(f"**Type:** {header['type']}\n")
        parts.append(f"**Size:** {header['size']} bytes\n")
        parts.append("#### Content:\n")
        parts.append(f"```\n{file_data['content']}\n```\n\n")
    return "".join(parts)

def extract_and_display(url):
    """Extract the repository at ``url`` and return it as formatted text.

    Passes ``url`` to ``extract_repo_content`` and feeds the resulting
    entries to ``format_output``; the formatted string is returned.
    """
    return format_output(extract_repo_content(url))

# Build the single-page UI: a URL box, an output box, and a button that
# runs the extraction and writes the result into the output box.
with gr.Blocks() as app:
    gr.Markdown("# Gradio Space/Model Content Extractor")
    url_input = gr.Textbox(label="Hugging Face Space/Model URL")
    output_display = gr.Textbox(
        show_copy_button=True,
        lines=20,
        placeholder="Output will be displayed here...",
    )
    extract_button = gr.Button("Extract Content")

    # Clicking the button sends the URL text to extract_and_display and
    # shows its return value in the output textbox.
    extract_button.click(
        fn=extract_and_display,
        inputs=url_input,
        outputs=output_display,
    )

# Start the Gradio server as soon as the module is executed.
app.launch()