Spaces:

dwb2023
/

hf_extractor

Runtime error

File size: 5,703 Bytes

98d1d12
 
4006c1a
bcc36e8
 
5e040c2
bcc36e8
 
 
 
 
 
 
 
 
56dbc6b
e33200c
 
 
 
c700267
98d1d12
 
c700267
e361a15
98d1d12
 
 
 
1ca0012
98d1d12
4006c1a
98d1d12
 
 
56dbc6b
 
4006c1a
 
56dbc6b
bcc36e8
56dbc6b
 
 
 
98d1d12
bcc36e8
0f701bd
 
 
bcc36e8
0f701bd
 
 
 
 
 
 
bcc36e8
0f701bd
 
 
 
bcc36e8
e33200c
 
 
bcc36e8
 
 
 
c700267
98d1d12
e33200c
4006c1a
bcc36e8
 
 
 
 
0f701bd
bcc36e8
 
 
 
 
 
 
 
 
4006c1a
e33200c
 
4006c1a
 
fce2161
 
4006c1a
bbd42f8
 
 
 
56dbc6b
 
bbd42f8
 
 
 
4006c1a
 
bcc36e8
 
fce2161
4006c1a
 
bcc36e8
4006c1a
 
994ffd2
8537faa
 
 
 
84ac83a
8537faa
 
 
 
 
bcc36e8
4006c1a
 
bcc36e8
4006c1a
56dbc6b

import os
import subprocess
import gradio as gr
from magika import Magika
from huggingface_hub import login

# Get the HF token and space author name from environment variables.
# Both values are passed to clone_repo(), which embeds them in the clone URL.
hf_token = os.getenv("HF_TOKEN")
hf_user = os.getenv("SPACE_AUTHOR_NAME")

# Fail fast at import time: an unset (or empty) variable aborts start-up
# with a ValueError instead of surfacing later as a clone failure.
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set")
if not hf_user:
    raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")

# Content-type labels whose file content is captured by extract_repo_content();
# files detected as any other type are listed with metadata only.
SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini", "jsonl", "ipynb"]

def validate_url(url):
    """Return True when ``url`` is an ``https://`` URL that names a host.

    Args:
        url: Value entered by the user; anything that is not a string
            (for example ``None``) is rejected rather than raising.

    Returns:
        bool: True only if the string starts with ``https://`` and a host
        name can be parsed from it.
    """
    from urllib.parse import urlsplit

    if not isinstance(url, str) or not url.startswith('https://'):
        return False
    try:
        # A bare "https://" (or "https:///path") has no host to clone from.
        return bool(urlsplit(url).hostname)
    except ValueError:
        # urlsplit rejects malformed authorities such as an unclosed "[".
        return False

def clone_repo(url, repo_dir, hf_token, hf_user):
    """Clone ``url`` into ``repo_dir`` with git, skipping Git LFS downloads.

    The Hugging Face credentials are attached to the clone URL only when the
    URL points at a Hugging Face host over https, so a user-supplied URL for
    some other server never receives the token.

    Args:
        url: Repository URL to clone.
        repo_dir: Destination directory for the clone.
        hf_token: Hugging Face access token used as the password.
        hf_user: Hugging Face user name used as the login.

    Returns:
        tuple: ``(True, None)`` on success, otherwise ``(False, message)``
        where ``message`` is git's stderr with the token masked, or the
        OS error text if git could not be started.
    """
    from urllib.parse import quote, urlsplit, urlunsplit

    env = os.environ.copy()
    # Leave LFS pointer files in place instead of downloading large blobs.
    env['GIT_LFS_SKIP_SMUDGE'] = '1'
    # Never block waiting for interactive credentials.
    env.setdefault('GIT_TERMINAL_PROMPT', '0')

    token_url = url
    quoted_token = None
    try:
        parts = urlsplit(url)
        host = (parts.hostname or '').lower()
        port = parts.port
    except ValueError:
        # Unparseable URL: hand it to git unchanged and let git report it.
        parts = None

    trusted_hosts = ('huggingface.co', 'hf.co')
    if parts is not None and parts.scheme == 'https' and host in trusted_hosts and hf_user and hf_token:
        # Percent-encode so characters like "@" or ":" cannot break the URL.
        quoted_token = quote(hf_token, safe='')
        netloc = f"{quote(hf_user, safe='')}:{quoted_token}@{host}"
        if port is not None:
            netloc = f"{netloc}:{port}"
        token_url = urlunsplit(parts._replace(netloc=netloc))

    try:
        # "--" stops git from treating a crafted URL as a command-line option.
        result = subprocess.run(
            ["git", "clone", "--", token_url, repo_dir],
            env=env,
            capture_output=True,
            text=True,
        )
    except OSError as e:
        # Typically git is not installed or not on PATH.
        return False, f"Unable to run git: {e}"

    if result.returncode != 0:
        message = result.stderr
        # The message is shown to the end user; never echo the credential.
        for secret in (hf_token, quoted_token):
            if secret:
                message = message.replace(secret, '***')
        return False, message
    return True, None

def get_file_summary(file_path, file_type):
    """Build the metadata header for one file.

    Args:
        file_path: Path of the file to describe.
        file_type: Content-type label to record for the file.

    Returns:
        dict: ``name`` (path relative to the current working directory),
        ``type``, ``size`` in bytes, and ``creation_date`` /
        ``modification_date`` as raw timestamps taken from the file's
        ctime and mtime.
    """
    # One stat call supplies the size and both timestamps.
    stat_info = os.stat(file_path)
    summary = {"name": os.path.relpath(file_path), "type": file_type}
    summary["size"] = stat_info.st_size
    summary["creation_date"] = stat_info.st_ctime
    summary["modification_date"] = stat_info.st_mtime
    return summary

def read_file_content(file_path, max_size=32*1024):
    """Return the text of ``file_path``, truncated when the file is large.

    The file is decoded as UTF-8 and undecodable bytes are dropped. When the
    on-disk size exceeds ``max_size`` bytes, only the first ``max_size``
    characters are returned, followed by a truncation marker.

    Args:
        file_path: Path of the file to read.
        max_size: Size threshold in bytes (also the number of characters
            kept when truncating).

    Returns:
        str: The full or truncated file content.
    """
    with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
        too_big = os.path.getsize(file_path) > max_size
        text = handle.read(max_size) if too_big else handle.read()
    if too_big:
        return text + "\n... [Content Truncated] ..."
    return text

def validate_file_types(directory):
    """Detect the content type of every file under ``directory``.

    Git's own ``.git`` directory is excluded. Symbolic links are recorded
    with the type ``"symlink"`` and are never opened, because a cloned
    repository is untrusted and a link may point outside of it.

    Args:
        directory: Root directory to scan recursively.

    Returns:
        dict: Maps each file path to the label reported by Magika,
        ``"symlink"`` for symbolic links, or ``"Error: <message>"`` when the
        file could not be read or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into git metadata.
        # An exact name match keeps directories such as ".github".
        dirs[:] = [d for d in dirs if d != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            if os.path.islink(file_path):
                # Do not follow links: the target may be outside the clone.
                file_types[file_path] = "symlink"
                continue
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Best effort: one unreadable file must not abort the scan.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types

def extract_repo_content(url, hf_token, hf_user):
    """Clone a repository and collect metadata and text content per file.

    Args:
        url: Repository URL; must pass validate_url().
        hf_token: Hugging Face access token forwarded to clone_repo().
        hf_user: Hugging Face user name forwarded to clone_repo().

    Returns:
        list: One dict per file with a ``header`` (file metadata) and a
        ``content`` string. Content is captured only for files whose type is
        in SUPPORTED_FILE_TYPES and whose size is at most 32 KiB. On an
        invalid URL or a failed clone, a single-element list describing the
        error is returned instead.
    """
    import shutil

    if not validate_url(url):
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]

    # NOTE: this fixed path is shared by every call to this function.
    repo_dir = "./temp_repo"
    # Remove leftovers from an earlier run; a missing directory is fine.
    shutil.rmtree(repo_dir, ignore_errors=True)

    try:
        success, error = clone_repo(url, repo_dir, hf_token, hf_user)
        if not success:
            return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]

        file_types = validate_file_types(repo_dir)
        extracted_content = []
        for file_path, file_type in file_types.items():
            try:
                file_summary = get_file_summary(file_path, file_type)
            except OSError as e:
                # e.g. a dangling symlink: report it and keep going instead
                # of losing the whole extraction.
                extracted_content.append({
                    "header": {
                        "name": os.path.relpath(file_path),
                        "type": file_type,
                        "size": 0,
                        "creation_date": None,
                        "modification_date": None,
                    },
                    "content": f"Failed to read file metadata: {str(e)}",
                })
                continue

            content = {"header": file_summary}
            if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 32 * 1024:
                try:
                    content["content"] = read_file_content(file_path)
                except Exception as e:
                    content["content"] = f"Failed to read file content: {str(e)}"
            else:
                content["content"] = "File too large or binary, content not captured."

            extracted_content.append(content)

        return extracted_content
    finally:
        # Always delete the clone, including on failures and exceptions.
        shutil.rmtree(repo_dir, ignore_errors=True)

def format_output(extracted_content, repo_url):
    """Render extracted file entries as a Markdown-style report.

    Args:
        extracted_content: Iterable of entries shaped like
            ``{"header": {...}, "content": str}``. The header must provide
            ``name``, ``type`` and ``size``; ``creation_date`` and
            ``modification_date`` are optional, so the error entries produced
            for an invalid URL or a failed clone render instead of raising
            KeyError.
        repo_url: URL shown in the report heading.

    Returns:
        str: The formatted report. Entries that are not dicts, or whose
        header is missing required fields, are rendered as
        ``"Error in file data format."``.
    """
    required_fields = ("name", "type", "size")
    # Collect the pieces and join once rather than growing a string in a loop.
    pieces = [f"# Repository URL: {repo_url}\n\n"]
    for file_data in extracted_content:
        header = file_data.get('header') if isinstance(file_data, dict) else None
        if not isinstance(header, dict) or any(field not in header for field in required_fields):
            pieces.append("Error in file data format.\n")
            continue

        pieces.append(f"### File: {header['name']}\n")
        pieces.append(f"**Type:** {header['type']}\n")
        pieces.append(f"**Size:** {header['size']} bytes\n")
        # Timestamps are absent (or None) on error entries; skip those lines.
        if header.get('creation_date') is not None:
            pieces.append(f"**Created:** {header['creation_date']}\n")
        if header.get('modification_date') is not None:
            pieces.append(f"**Modified:** {header['modification_date']}\n")
        pieces.append("#### Content:\n")
        pieces.append(f"```\n{file_data.get('content', '')}\n```\n\n")
    return "".join(pieces)

def extract_and_display(url):
    """Extract the repository at ``url`` and return the formatted report.

    Uses the module-level ``hf_token`` and ``hf_user`` credentials for the
    clone, then renders the result with format_output().
    """
    return format_output(extract_repo_content(url, hf_token, hf_user), url)

# Build the Gradio UI. Components created inside the `with app:` block are
# attached to this Blocks instance.
app = gr.Blocks(theme="sudeepshouche/minimalist")

with app:
    gr.Markdown("# Hugging Face Space / Model Repository Content Extractor")
    # Free-text repository URL; it is validated later by extract_repo_content().
    url_input = gr.Textbox(label="https:// URL of Repository", placeholder="Enter the repository URL here OR select an example below...")
    # Example URLs that populate url_input.
    url_examples = gr.Examples(
        examples=[
            ["https://huggingface.co/spaces/big-vision/paligemma-hf"],
            ["https://huggingface.co/google/paligemma-3b-mix-224"],
            ["https://huggingface.co/microsoft/Phi-3-vision-128k-instruct"],
            ["https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf"]
        ],
        inputs=url_input
    )
    # Text area that receives the formatted report.
    output_display = gr.Textbox(label="Extracted Repository Content", show_copy_button=True, lines=20, placeholder="Repository content will be extracted here...\n\nMetadata is captured for all files, but text content provided only for files less than 32 kb\n\n\n\nReview and search through the content here OR simply copy it for offline analysis!!. 🤖")
    extract_button = gr.Button("Extract Content")
    
    # Clicking the button runs extract_and_display(url) and shows its return value.
    extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)

# Start the app when this module is executed.
app.launch()