Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,590 Bytes
98d1d12 4006c1a 0f701bd 4006c1a 98d1d12 1ca0012 98d1d12 4006c1a 98d1d12 4006c1a 98d1d12 6be117b 98d1d12 0f701bd 4006c1a 98d1d12 4006c1a 98d1d12 4006c1a 0f701bd 4006c1a 0f701bd 1ca0012 0f701bd 1ca0012 0f701bd 4006c1a bbd42f8 4006c1a 0e324a0 4006c1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import os
import subprocess
import gradio as gr
from magika import Magika
def clone_repo(url, repo_dir):
    """Clone the git repository at ``url`` into ``repo_dir``.

    The child process inherits the current environment with
    ``GIT_LFS_SKIP_SMUDGE=1`` added (per Git LFS, this leaves LFS-tracked
    files as small pointer files instead of downloading their content).

    Args:
        url: Repository location handed to ``git clone``. Treated as
            untrusted input.
        repo_dir: Destination directory for the clone.

    Returns:
        ``(True, None)`` when git exits with status 0, otherwise
        ``(False, message)`` where ``message`` is git's stderr output or a
        description of why git could not be started.
    """
    env = os.environ.copy()
    env['GIT_LFS_SKIP_SMUDGE'] = '1'
    # "--" ends option parsing, so a URL that starts with "-" is always
    # interpreted as a repository and never as a git command-line option.
    command = ["git", "clone", "--", url, repo_dir]
    try:
        result = subprocess.run(command, env=env, capture_output=True, text=True)
    except OSError as exc:
        # git missing or not executable: report it through the same
        # (success, error) contract instead of raising into the caller.
        return False, f"Failed to run git: {exc}"
    if result.returncode != 0:
        return False, result.stderr
    return True, None
def get_file_summary(file_path, file_type):
    """Build the header record that describes a single file.

    Args:
        file_path: Path to an existing file.
        file_type: Content-type label to store for the file.

    Returns:
        A dict with keys ``name`` (``file_path`` expressed relative to the
        current working directory), ``type`` (``file_type`` unchanged) and
        ``size`` (the file size in bytes).
    """
    # Size is looked up first so a missing file fails before anything else.
    byte_count = os.path.getsize(file_path)
    display_name = os.path.relpath(file_path)
    return dict(name=display_name, type=file_type, size=byte_count)
def read_file_content(file_path):
    """Return the whole text of ``file_path`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped silently
    (``errors="ignore"``) rather than raising a decode error.
    """
    handle = open(file_path, mode="r", encoding="utf-8", errors="ignore")
    try:
        text = handle.read()
    finally:
        handle.close()
    return text
def validate_file_types(directory):
    """Identify the content type of every file under ``directory`` with Magika.

    The tree is walked recursively. Directories named exactly ``.git`` are
    pruned so git metadata is never scanned; other names that merely contain
    ".git" (for example ``.github``) are still visited.

    Args:
        directory: Root directory to scan.

    Returns:
        A dict mapping each file path (``root`` joined with the file name)
        to its detected label, or to ``"Error: <message>"`` when the file
        could not be read or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Editing ``dirs`` in place tells os.walk not to descend into the
        # removed entries. An exact-name match avoids the false positives of
        # a substring test against the whole path.
        dirs[:] = [name for name in dirs if name != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                # NOTE(review): ``ct_label`` is the attribute this file was
                # written against; it may differ between Magika versions --
                # confirm against the installed release.
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Best effort: record the failure for this file and keep
                # scanning the rest of the tree.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types
def extract_repo_content(url):
    """Clone ``url`` and collect a header plus content entry for each file.

    The repository is cloned into ``./temp_repo``; anything already at that
    path is removed first. Each file is classified with
    ``validate_file_types`` and its text is captured only when the detected
    type is one of the known text types and the file is at most 1 MiB.

    Args:
        url: Repository URL passed to ``clone_repo``.

    Returns:
        A list of dicts, each with a ``header`` (see ``get_file_summary``)
        and a ``content`` string. When cloning fails, the list holds a
        single entry whose header type is ``"error"`` and whose content is
        the error reported by ``clone_repo``.
    """
    # Local import keeps this block self-contained; shutil replaces the
    # former ``rm -rf`` subprocess, which only works where ``rm`` exists.
    import shutil

    text_types = ("txt", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html")
    max_text_size = 1024 * 1024

    repo_dir = "./temp_repo"
    # Clear leftovers from a previous run. Failures are tolerated here
    # because a stale path makes the clone below fail and report the error.
    if os.path.isdir(repo_dir) and not os.path.islink(repo_dir):
        shutil.rmtree(repo_dir, ignore_errors=True)
    elif os.path.lexists(repo_dir):
        try:
            os.remove(repo_dir)
        except OSError:
            pass

    success, error = clone_repo(url, repo_dir)
    if not success:
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": error}]

    file_types = validate_file_types(repo_dir)
    extracted_content = []
    for file_path, file_type in file_types.items():
        file_summary = get_file_summary(file_path, file_type)
        content = {"header": file_summary}
        if file_type in text_types and file_summary["size"] <= max_text_size:
            try:
                content["content"] = read_file_content(file_path)
            except Exception as e:
                # Keep the entry and surface the failure in place of the text.
                content["content"] = f"Failed to read file content: {str(e)}"
        else:
            content["content"] = "File too large or binary, content not captured."
        extracted_content.append(content)
    return extracted_content
def format_output(extracted_content):
    """Render extracted file entries as one Markdown-style string.

    Each entry that is a dict containing a ``header`` key becomes a section
    with the file name, type, size and its content inside a fenced block.
    Any other entry contributes a single error line.

    Args:
        extracted_content: Iterable of entries as produced by
            ``extract_repo_content``.

    Returns:
        The concatenated sections; an empty string for empty input.
    """
    sections = []
    for entry in extracted_content:
        if not (isinstance(entry, dict) and 'header' in entry):
            sections.append("Error in file data format.\n")
            continue
        header = entry['header']
        sections.append(
            f"### File: {header['name']}\n"
            f"**Type:** {header['type']}\n"
            f"**Size:** {header['size']} bytes\n"
            "#### Content:\n"
            f"```\n{entry['content']}\n```\n\n"
        )
    return "".join(sections)
def extract_and_display(url):
    """Extract the repository at ``url`` and return it as formatted text.

    Passes the result of ``extract_repo_content`` straight to
    ``format_output``; this is the callback wired to the UI button.
    """
    return format_output(extract_repo_content(url))
# UI definition: a URL box, an output box and a button that runs the
# extraction. Components are created in the order they should appear.
with gr.Blocks() as app:
    gr.Markdown("# Gradio Space/Model Content Extractor")
    url_input = gr.Textbox(label="Hugging Face Space/Model URL")
    output_display = gr.Textbox(
        show_copy_button=True,
        lines=20,
        placeholder="Output will be displayed here...",
    )
    extract_button = gr.Button("Extract Content")
    # Clicking the button sends the URL text to extract_and_display and
    # writes the returned string into the output box.
    extract_button.click(
        fn=extract_and_display,
        inputs=url_input,
        outputs=output_display,
    )

app.launch()
|