File size: 3,590 Bytes
98d1d12
 
4006c1a
0f701bd
4006c1a
98d1d12
 
 
 
 
 
 
 
1ca0012
98d1d12
4006c1a
98d1d12
 
 
4006c1a
 
98d1d12
6be117b
98d1d12
 
0f701bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4006c1a
98d1d12
 
 
4006c1a
98d1d12
 
 
4006c1a
0f701bd
4006c1a
0f701bd
1ca0012
0f701bd
 
1ca0012
0f701bd
 
 
 
 
 
 
 
4006c1a
 
 
 
 
 
bbd42f8
 
 
 
 
 
 
 
4006c1a
 
 
 
 
 
 
 
 
 
 
 
0e324a0
4006c1a
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import shutil
import subprocess

import gradio as gr
from magika import Magika

def clone_repo(url, repo_dir):
    """Clone *url* into *repo_dir* with ``git clone``, skipping Git LFS downloads.

    Args:
        url: Repository URL (or local path) to clone.
        repo_dir: Destination directory for the clone.

    Returns:
        ``(True, None)`` on success. On failure ``(False, message)``, where
        *message* is git's stderr output or, when git could not be started
        at all, a description of that error.
    """
    env = os.environ.copy()
    # Tell git-lfs not to download large objects during checkout.
    env['GIT_LFS_SKIP_SMUDGE'] = '1'
    try:
        result = subprocess.run(
            # "--" ends option parsing, so a URL that starts with "-" is
            # treated as a repository argument and never as a git option.
            ["git", "clone", "--", url, repo_dir],
            env=env,
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # git missing or not executable: report it through the same
        # (success, error) channel callers already handle.
        return False, f"Failed to run git: {exc}"
    if result.returncode != 0:
        return False, result.stderr
    return True, None

def get_file_summary(file_path, file_type):
    """Build the header record for one file.

    Args:
        file_path: Path of the file on disk.
        file_type: Content-type label to store in the record.

    Returns:
        Dict with ``name`` (path relative to the current working directory),
        ``type`` (the given *file_type*) and ``size`` (bytes on disk).
    """
    # Size is looked up first so a missing file fails before anything else.
    byte_count = os.path.getsize(file_path)
    display_name = os.path.relpath(file_path)
    return dict(name=display_name, type=file_type, size=byte_count)

def read_file_content(file_path):
    """Return the whole file as text.

    The file is decoded as UTF-8; byte sequences that are not valid UTF-8
    are dropped rather than raising.
    """
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as stream:
        text = stream.read()
    return text

def validate_file_types(directory):
    """Detect the content type of every file under *directory* with Magika.

    Directories named exactly ``.git`` are not scanned. Other names that
    merely contain ``.git`` (``.github``, ``foo.git``) are scanned normally.

    Args:
        directory: Root directory to walk.

    Returns:
        Dict mapping each file path (``root`` joined with the file name, as
        produced by ``os.walk``) to its detected content-type label, or to
        ``"Error: <message>"`` when the file could not be read or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into git metadata. An
        # exact-name match avoids the false positives of a substring test
        # on the path.
        dirs[:] = [d for d in dirs if d != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                # NOTE(review): `output.ct_label` is the attribute this file
                # relies on; confirm it matches the installed Magika version.
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Best effort: record the failure for this file and keep going.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types

def extract_repo_content(url):
    """Clone *url* and return one header/content record per file in it.

    The repository is cloned into ``./temp_repo``; anything already at that
    path is removed first.

    Args:
        url: Repository URL to clone.

    Returns:
        List of dicts shaped ``{"header": {"name", "type", "size"},
        "content": str}``. Content is the file text for recognised text
        types up to 1 MiB, otherwise a placeholder message. If the clone
        fails, a single record with ``type == "error"`` carrying the clone
        error is returned.
    """
    # Detected types whose content is captured as text, and the size cap.
    text_types = frozenset(
        {"txt", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html"}
    )
    max_text_size = 1024 * 1024

    repo_dir = "./temp_repo"
    if os.path.exists(repo_dir):
        # Drop whatever a previous run left behind. Removal errors are
        # ignored here; if the directory survives, the clone below reports
        # the failure through its error message.
        shutil.rmtree(repo_dir, ignore_errors=True)

    success, error = clone_repo(url, repo_dir)
    if not success:
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": error}]

    file_types = validate_file_types(repo_dir)
    extracted_content = []
    for file_path, file_type in file_types.items():
        try:
            file_summary = get_file_summary(file_path, file_type)
        except OSError as e:
            # The size lookup can fail for entries such as dangling
            # symlinks. Report that one file instead of aborting the run.
            extracted_content.append({
                "header": {
                    "name": os.path.relpath(file_path),
                    "type": file_type,
                    "size": 0,
                },
                "content": f"Failed to read file metadata: {str(e)}",
            })
            continue

        content = {"header": file_summary}

        if file_type in text_types and file_summary["size"] <= max_text_size:
            try:
                content["content"] = read_file_content(file_path)
            except Exception as e:
                content["content"] = f"Failed to read file content: {str(e)}"
        else:
            content["content"] = "File too large or binary, content not captured."

        extracted_content.append(content)

    return extracted_content

def format_output(extracted_content):
    """Render extracted file records as Markdown-style text.

    Args:
        extracted_content: Iterable of records shaped
            ``{"header": {"name", "type", "size"}, "content": ...}``.

    Returns:
        One string with a section per record: file name, type, size and the
        content inside a fenced block. Any record that is not a dict, or
        that lacks ``header``, ``content`` or one of the header fields, is
        rendered as the line ``Error in file data format.`` instead of
        raising.
    """
    malformed = "Error in file data format.\n"
    sections = []
    for file_data in extracted_content:
        if not isinstance(file_data, dict):
            sections.append(malformed)
            continue
        try:
            header = file_data["header"]
            name = header["name"]
            file_type = header["type"]
            size = header["size"]
            body = file_data["content"]
        except (KeyError, TypeError):
            # Missing key, or a header that is not subscriptable by key.
            sections.append(malformed)
            continue
        sections.append(
            f"### File: {name}\n"
            f"**Type:** {file_type}\n"
            f"**Size:** {size} bytes\n"
            "#### Content:\n"
            f"```\n{body}\n```\n\n"
        )
    # Join once at the end instead of growing a string piece by piece.
    return "".join(sections)

def extract_and_display(url):
    """Extract the repository at *url* and return its files as formatted text."""
    return format_output(extract_repo_content(url))

# UI layout: a URL box, a read-only style output box and a trigger button.
with gr.Blocks() as app:
    gr.Markdown("# Gradio Space/Model Content Extractor")
    url_input = gr.Textbox(label="Hugging Face Space/Model URL")
    output_display = gr.Textbox(
        show_copy_button=True,
        lines=20,
        placeholder="Output will be displayed here...",
    )
    extract_button = gr.Button("Extract Content")

    # Clicking the button runs the extraction on the entered URL and puts
    # the formatted result in the output box.
    extract_button.click(
        fn=extract_and_display,
        inputs=url_input,
        outputs=output_display,
    )

# Launched at module level, so running (or importing) this file starts the app.
app.launch()