Spaces:

Vishisht16
/

code-summarizer

Running

App Files Files

xet

Community

Vishisht16 committed on Apr 13

Commit

188e720

1 Parent(s): 4b6ece1

Add application code

Browse files

Files changed (9) hide show

app.py +95 -0
code_summarizer/New Text Document.txt +0 -0
code_summarizer/__init__.py +29 -0
code_summarizer/firebase_db.py +69 -0
code_summarizer/language_parsers.py +76 -0
code_summarizer/repo_downloader.py +28 -0
code_summarizer/summarizer.py +95 -0
interface.py +100 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import argparse
+from pathlib import Path
+import json
+import logging
+import sys
+import time
+from code_summarizer import (
+    clone_repo,
+    summarize_repo,
+    upload_summary_to_firebase,
+    get_summaries_by_repo,
+    is_firestore_available
+)
+# Import device/model status separately if needed for logging
+from code_summarizer.summarizer import device as summarizer_device, MODEL_LOADED as SUMMARIZER_LOADED
+# Basic logging config for the CLI app: INFO level, every message tagged "[CLI]".
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [CLI] %(message)s')
+log = logging.getLogger(__name__)
+# Directory (relative to the working directory) the target repository is cloned into.
+REPO_CLONE_DIR = "cloned_repo_cli"
+# Directory and file used by run_pipeline for the local JSON copy of the summaries.
+OUTPUT_DIR = Path("outputs")
+OUTPUT_FILE = OUTPUT_DIR / "summaries.json"
+def run_pipeline(repo_url: str, skip_existing: bool = False, save_local: bool = True):
+    """Clone a repository, summarize its functions, and store the results.
+
+    Args:
+        repo_url: URL of the repository to clone and summarize.
+        skip_existing: When true and Firestore is available, return early if
+            summaries for ``repo_url`` already exist in Firestore.
+        save_local: When true, write the summaries to ``OUTPUT_FILE`` as JSON.
+
+    Returns:
+        None. The process exits with status 1 (``sys.exit(1)``) when the
+        summarizer model is not loaded or the clone fails.
+    """
+    start_time = time.time()
+    log.info(f"Pipeline starting for: {repo_url}")
+    # Without the model nothing can be summarized, so abort the whole process.
+    if not SUMMARIZER_LOADED:
+         log.error("Summarizer Model Not Loaded. Exiting.")
+         sys.exit(1)
+    # Firestore is optional: without it the run still summarizes and can save locally.
+    firestore_ready = is_firestore_available()
+    if not firestore_ready:
+        log.warning("Firebase is not available. Uploads/Checks will be skipped.")
+    # The "already summarized" check is only possible when Firestore can be queried.
+    if skip_existing and firestore_ready:
+        log.info("Checking for existing summaries...")
+        if get_summaries_by_repo(repo_url):
+            log.warning("Skipping. Found existing summaries in Firebase.")
+            return
+    log.info("Cloning repository...")
+    clone_dir_path = Path(REPO_CLONE_DIR)
+    if not clone_repo(repo_url, str(clone_dir_path)):
+        log.error("Repo cloning failed. Exiting.")
+        sys.exit(1)
+    log.info(f"Running summarization (device: {summarizer_device})...")
+    summaries = summarize_repo(clone_dir_path, repo_url)
+    # An empty result is not treated as an error: log it and stop without saving.
+    if not summaries:
+        log.warning("No functions found or summarization failed.")
+        return
+    log.info(f"Summarization complete. Found {len(summaries)} functions.")
+    if firestore_ready:
+        log.info(f"Uploading {len(summaries)} summaries to Firebase...")
+        # Counts upload attempts; the outcome of each call is not inspected here.
+        upload_count = 0
+        for i, summary in enumerate(summaries):
+            upload_summary_to_firebase(summary)
+            upload_count +=1
+            # Progress line every 100 uploads.
+            if (i + 1) % 100 == 0:
+                 log.info(f"  Uploaded {i+1}/{len(summaries)}...")
+        log.info(f"Finished uploading {upload_count} summaries.")
+    else:
+        log.info("Skipping Firebase upload.")
+    if save_local:
+        log.info(f"Saving summaries locally to {OUTPUT_FILE}...")
+        try:
+            OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+            with open(OUTPUT_FILE, "w", encoding='utf-8') as f:
+                # default=str stringifies any value json cannot serialize natively.
+                json.dump(summaries, f, indent=2, default=str)
+            log.info(f"Saved local backup to {OUTPUT_FILE}")
+        except Exception as e:
+            # A failed local save is logged but does not abort the pipeline.
+            log.error(f"Failed to save local backup: {e}", exc_info=True)
+    duration = time.time() - start_time
+    log.info(f"✅ Pipeline completed in {duration:.2f} seconds.")
+# CLI entry point: parse the command-line flags and run the pipeline once.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Code Summarizer CLI", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--url", required=True, help="HTTPS URL of the public GitHub repository.")
+    parser.add_argument("--skip_existing", action="store_true", help="Skip if repo already summarized in Firebase.")
+    parser.add_argument("--no_save", action="store_true", help="Disable saving local summaries.json.")
+    args = parser.parse_args()
+    run_pipeline(
+        repo_url=args.url,
+        skip_existing=args.skip_existing,
+        # The flag is negative (--no_save) while the parameter is positive (save_local).
+        save_local=not args.no_save
+    )

code_summarizer/New Text Document.txt ADDED Viewed

File without changes

code_summarizer/__init__.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# Package entry point: re-exports the public API of the submodules.
+import logging
+log = logging.getLogger(__name__)
+# The package itself attaches only a NullHandler; real handlers are left to the application.
+log.addHandler(logging.NullHandler())
+from .repo_downloader import clone_repo
+from .language_parsers import extract_code_snippets, get_language_by_extension, SUPPORTED_EXTENSIONS
+from .summarizer import summarize_repo, summarize_file, get_embedding, generate_summary
+from .firebase_db import upload_summary_to_firebase, get_summaries_by_repo, is_firestore_available
+VERSION = "0.1.0"
+# Names exported by `from code_summarizer import *`.
+__all__ = [
+    "clone_repo",
+    "extract_code_snippets",
+    "get_language_by_extension",
+    "SUPPORTED_EXTENSIONS",
+    "summarize_repo",
+    "summarize_file",
+    "get_embedding",
+    "generate_summary",
+    "upload_summary_to_firebase",
+    "get_summaries_by_repo",
+    "is_firestore_available",
+    "VERSION"
+]
+# Runs once, at import time, after all submodules above have been imported.
+log.info(f"Code Summarizer Package v{VERSION} initialized.")

code_summarizer/firebase_db.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import firebase_admin
+from firebase_admin import credentials, firestore
+import os
+import logging
+from typing import List, Dict
+log = logging.getLogger(__name__)
+# Module-level Firestore state, set once when this module is imported.
+# FIRESTORE_INITIALIZED becomes True only after a client has been created; db holds that client.
+FIRESTORE_INITIALIZED = False
+db = None
+# Service-account credentials are expected as a JSON string in this environment variable.
+firebase_secret_json = os.environ.get('FIREBASE_SERVICE_ACCOUNT_JSON')
+if firebase_secret_json:
+    try:
+        import json
+        # Convert the JSON string from the env var into a dictionary
+        credentials_dict = json.loads(firebase_secret_json)
+        # NOTE(review): _apps is a private attribute of firebase_admin; it is used here
+        # to skip initialize_app when an app has already been initialized.
+        if not firebase_admin._apps:
+            cred = credentials.Certificate(credentials_dict)
+            firebase_admin.initialize_app(cred)
+            log.info("Firebase Admin SDK initialized from Secret.")
+        else:
+            log.info("Firebase Admin SDK already initialized.")
+        db = firestore.client()
+        FIRESTORE_INITIALIZED = True
+    except Exception as e:
+        # Any failure is logged and leaves FIRESTORE_INITIALIZED False, so the
+        # functions below report Firestore as unavailable.
+        log.error(f"Failed to initialize Firebase from Secret: {e}", exc_info=True)
+else:
+    log.warning("Firebase Secret (FIREBASE_SERVICE_ACCOUNT_JSON) not found in environment. Firebase disabled.")
+def is_firestore_available() -> bool:
+    """Report whether the Firestore client was created at import time and can be used."""
+    if not FIRESTORE_INITIALIZED:
+        return False
+    return db is not None
+def upload_summary_to_firebase(summary: Dict) -> bool:
+    """Upload one function summary as a new document in the "functions" collection.
+
+    The caller's dictionary is never modified: a shallow copy is uploaded, and
+    an ``embedding`` value that is not a list is dropped from that copy only.
+
+    Args:
+        summary: Summary record. Must contain the keys ``repo_url``,
+            ``file_path``, ``language``, ``function_code`` and ``summary``.
+
+    Returns:
+        True if the document was written. False if Firestore is unavailable,
+        a required key is missing, or the write raised an exception (the
+        exception is logged, not propagated).
+    """
+    if not is_firestore_available():
+        log.debug("Firestore unavailable, skipping upload.")
+        return False
+    required_keys = ('repo_url', 'file_path', 'language', 'function_code', 'summary')
+    if not all(key in summary for key in required_keys):
+        log.warning(f"Skipped upload: Missing required keys. Has: {list(summary.keys())}")
+        return False
+    # Work on a copy so callers that reuse the record afterwards (for example
+    # to save it locally) still see every field they passed in.
+    payload = dict(summary)
+    try:
+        if "embedding" in payload and not isinstance(payload["embedding"], list):
+            log.warning(f"Removing invalid non-list embedding before upload for {summary.get('file_path')}")
+            del payload["embedding"]
+        # document() without an id creates a new, auto-named document reference.
+        doc_ref = db.collection("functions").document()
+        doc_ref.set(payload)
+        log.debug(f"Uploaded summary for: {summary.get('file_path')}")
+        return True
+    except Exception as e:
+        log.error(f"Error uploading summary for {summary.get('file_path')} to Firebase: {e}", exc_info=True)
+        return False
+def get_summaries_by_repo(repo_url: str) -> List[Dict]:
+    """Fetch every stored summary whose ``repo_url`` field equals ``repo_url``.
+
+    Returns:
+        The matching documents from the "functions" collection as dictionaries.
+        An empty list when Firestore is unavailable or the query fails; the
+        failure is logged, not raised.
+    """
+    if not is_firestore_available():
+        log.warning("Firestore unavailable, cannot fetch summaries.")
+        return []
+    try:
+        log.info(f"Querying Firestore for repo_url: {repo_url}")
+        query = db.collection("functions").where("repo_url", "==", repo_url)
+        summaries = []
+        for doc in query.stream():
+            summaries.append(doc.to_dict())
+        log.info(f"Found {len(summaries)} existing summaries in Firestore for {repo_url}.")
+    except Exception as e:
+        log.error(f"Error fetching summaries for {repo_url} from Firebase: {e}", exc_info=True)
+        return []
+    return summaries

code_summarizer/language_parsers.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+from pathlib import Path
+from typing import List, Tuple, Dict, Optional
+import re
+import ast
+import logging
+log = logging.getLogger(__name__)
+# Note: ast.get_source_segment requires Python 3.8+
+# Maps a lower-case file extension to a language name. Every language except
+# "python" has a matching regex in `patterns` below.
+SUPPORTED_EXTENSIONS: Dict[str, str] = {
+    ".py": "python", ".js": "javascript", ".java": "java", ".cpp": "cpp",
+    ".c": "c", ".cs": "csharp", ".ts": "typescript", ".go": "go"
+}
+# Regex patterns (simplified, may need adjustment per language)
+# WARNING: Regex-based parsing is fragile.
+# All groups are non-capturing. Each pattern ends with `\{[\s\S]*?^\}`: the body is
+# matched lazily up to the first `}` at the start of a line, so the patterns must be
+# applied with re.MULTILINE (extract_functions_by_regex does this).
+patterns = {
+    "javascript": r"^(?:async\s+)?function\s+\w+\s*\([^)]*\)\s*\{[\s\S]*?^\}|(?:export\s+)?(?:const|let|var)\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)\s*=>\s*\{[\s\S]*?^\}",
+    "typescript": r"^(?:async\s+)?function\s+\w+\s*\([^)]*\)\s*\{[\s\S]*?^\}|(?:export\s+)?(?:const|let|var)\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)\s*=>\s*\{[\s\S]*?^\}",
+    "java": r"^(?:public|private|protected|static|\s)*\s*[\w<>\[\]]+\s+\w+\s*\([^)]*\)\s*(?:throws\s+[\w,\s]+)?\s*\{[\s\S]*?^\}",
+    "cpp": r"^(?:[\w:]+)\s+\**\s*[\w:]+\s*\([^)]*\)\s*(?:const)?\s*\{[\s\S]*?^\}",
+    "c": r"^(?:[\w:]+)\s+\**\s*[\w:]+\s*\([^)]*\)\s*(?:const)?\s*\{[\s\S]*?^\}",
+    "csharp": r"^(?:public|private|protected|internal|static|virtual|async|override|\s)*\s*[\w<>\[\],?]+\s+\w+\s*\([^)]*\)\s*\{[\s\S]*?^\}",
+    "go": r"^func(?:\s*\(\s*\w+\s+\*?\w+\s*\))?\s+\w+\s*\([^)]*\)\s*(?:[\w\s,()]+)?\s*\{[\s\S]*?^\}"
+}
+def get_language_by_extension(file_path: Path) -> Optional[str]:
+    """Map a file's extension, compared in lower case, to its language name.
+
+    Returns None when the extension is not listed in ``SUPPORTED_EXTENSIONS``.
+    """
+    extension = file_path.suffix.lower()
+    return SUPPORTED_EXTENSIONS.get(extension)
+def extract_python_functions(file_path: Path) -> List[str]:
+    """Return the source text of every function defined in a Python file.
+
+    Both ``def`` and ``async def`` definitions are collected, including
+    methods and nested functions (a nested function therefore also appears
+    inside the text of its enclosing function).
+
+    Args:
+        file_path: Path of the Python source file to parse.
+
+    Returns:
+        Source segments in ``ast.walk`` order. Empty when the file cannot be
+        read or parsed; such failures are logged instead of raised.
+    """
+    functions = []
+    try:
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            source = f.read()
+        tree = ast.parse(source, filename=str(file_path))
+        for node in ast.walk(tree):
+            # `async def` is a separate node type and would otherwise be skipped.
+            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                continue
+            try:
+                segment = ast.get_source_segment(source, node)
+            except Exception as e:
+                # Best effort: one unextractable function must not lose the rest.
+                log.debug(f"Could not extract a function segment from {file_path}: {e}")
+                continue
+            if segment:
+                functions.append(segment)
+    except (OSError, SyntaxError, ValueError) as e:
+        # OSError covers missing or unreadable files. ValueError covers
+        # UnicodeDecodeError and sources that ast.parse rejects outright
+        # (on some Python versions, e.g. text containing null bytes).
+        log.warning(f"Skipping file {file_path} due to parsing error: {e}")
+    except Exception as e:
+        log.error(f"Unexpected error parsing Python file {file_path}: {e}", exc_info=True)
+    return functions
+def extract_functions_by_regex(file_path: Path, pattern: str) -> List[str]:
+    """Return the full text of every match of ``pattern`` in a source file.
+
+    Args:
+        file_path: Path of the source file to scan.
+        pattern: Regular expression describing one function definition. It is
+            applied with ``re.DOTALL | re.MULTILINE``.
+
+    Returns:
+        One string per match, in file order. Empty when the file cannot be
+        read or the pattern fails; such failures are logged instead of raised.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            code = f.read()
+        # finditer + group(0) always yields the whole match. re.findall would
+        # return group contents (or tuples) if the pattern had capturing groups.
+        return [match.group(0) for match in re.finditer(pattern, code, re.DOTALL | re.MULTILINE)]
+    except (OSError, UnicodeDecodeError) as e:
+         log.warning(f"Skipping file {file_path} due to read error: {e}")
+         return []
+    except Exception as e:
+        log.error(f"Failed regex extraction on {file_path}: {e}", exc_info=True)
+        return []
+def extract_code_snippets(file_path: Path) -> Tuple[Optional[str], List[str]]:
+    """Detect a file's language and extract its function snippets.
+
+    Returns:
+        ``(language, snippets)``. ``language`` is None (with an empty list)
+        for unsupported extensions. Python files are parsed with ``ast``;
+        other languages use the regex registered in ``patterns``, and a
+        language without a pattern yields an empty list.
+    """
+    language = get_language_by_extension(file_path)
+    if language is None:
+        return None, []
+    if language == "python":
+        snippets = extract_python_functions(file_path)
+    else:
+        pattern = patterns.get(language)
+        if not pattern:
+            log.debug(f"No regex pattern defined for language: {language} in file {file_path}")
+            snippets = []
+        else:
+            snippets = extract_functions_by_regex(file_path, pattern)
+    return language, snippets

code_summarizer/repo_downloader.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import os
+import shutil
+from git import Repo, GitCommandError
+import logging
+log = logging.getLogger(__name__)
+def clone_repo(repo_url: str, dest_folder: str = "cloned_repo") -> bool:
+    """Clone ``repo_url`` into ``dest_folder``, replacing whatever is already there.
+
+    Returns:
+        True when the clone succeeded. False when the existing destination
+        could not be removed or the clone failed; the error is logged.
+    """
+    destination_exists = os.path.exists(dest_folder)
+    if destination_exists:
+        # Start from a clean slate so the clone never lands in a stale checkout.
+        log.info(f"Removing existing directory: {dest_folder}")
+        try:
+            shutil.rmtree(dest_folder)
+        except OSError as e:
+            log.error(f"Error removing directory {dest_folder}: {e}")
+            return False
+    succeeded = False
+    try:
+        log.info(f"Cloning repo from {repo_url} into {dest_folder}...")
+        Repo.clone_from(repo_url, dest_folder)
+        log.info("Repo cloned successfully.")
+        succeeded = True
+    except GitCommandError as e:
+        log.error(f"Error cloning repo: Git command failed - {e}")
+    except Exception as e:
+        log.error(f"An unexpected error occurred during cloning: {e}")
+    return succeeded

code_summarizer/summarizer.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import torch
+from transformers import RobertaTokenizer, RobertaModel, logging as hf_logging
+from typing import List, Dict, Optional
+from code_summarizer.language_parsers import extract_code_snippets, SUPPORTED_EXTENSIONS
+from pathlib import Path
+import numpy as np
+import logging
+log = logging.getLogger(__name__)
+# Restrict Hugging Face transformers logging to errors.
+hf_logging.set_verbosity_error()
+# Use CUDA when torch reports it as available, otherwise the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+log.info(f"Summarizer using device: {device}")
+# Model state, populated once at import time. MODEL_LOADED is the flag other
+# modules check before attempting any summarization.
+MODEL_LOADED = False
+tokenizer = None
+model = None
+try:
+    log.info("Loading CodeBERT tokenizer/model...")
+    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
+    model = RobertaModel.from_pretrained("microsoft/codebert-base")
+    model = model.to(device)
+    # Evaluation mode: the model is only used for inference in this module.
+    model.eval()
+    MODEL_LOADED = True
+    log.info("CodeBERT model loaded successfully.")
+except Exception as e:
+    # Loading failures are logged and leave MODEL_LOADED False instead of
+    # breaking the import.
+    log.error(f"Failed to load CodeBERT model: {e}", exc_info=True)
+def get_embedding(code: str) -> Optional[List[float]]:
+    """Embed a code snippet with the loaded model.
+
+    The snippet is tokenized (truncated to at most 512 tokens), run through
+    the model without gradient tracking, and the last hidden state is averaged
+    over dimension 1 (presumably the token axis; confirm against the model's
+    output layout).
+
+    Returns:
+        The embedding as a list of floats, or None when the model is not
+        loaded or embedding fails (the failure is logged as a warning).
+    """
+    model_ready = MODEL_LOADED and tokenizer is not None and model is not None
+    if not model_ready:
+        return None
+    try:
+        encoded = tokenizer(code, return_tensors="pt", truncation=True, max_length=512, padding=True)
+        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
+        with torch.no_grad():
+            outputs = model(**on_device)
+        pooled = outputs.last_hidden_state.mean(dim=1)
+        vector = pooled.squeeze().cpu().numpy()
+        return vector.tolist()
+    except Exception as e:
+        log.warning(f"Failed to generate embedding: {e}. Snippet start: {code[:50]}...")
+        return None
+def generate_summary(snippet: str) -> str:
+    """Build a one-line description from the first meaningful line of a snippet.
+
+    The first non-blank line that does not start with ``#``, ``//`` or ``/*``
+    is used as the header and cut to 100 characters plus ``...`` when longer.
+
+    Returns:
+        The description, ``"N/A Summary"`` when no such line exists, or
+        ``"Summary generation failed."`` when processing raises.
+    """
+    try:
+        header = ""
+        for raw_line in snippet.strip().split('\n'):
+            candidate = raw_line.strip()
+            if candidate and not candidate.startswith(('#', '//', '/*')):
+                header = candidate
+                break
+        if len(header) > 100:
+            header = header[:100] + "..."
+        if not header:
+            return "N/A Summary"
+        return f"Function/method starting with `{header}`."
+    except Exception:
+        return "Summary generation failed."
+def summarize_file(file_path: Path, repo_url: str) -> List[Dict]:
+    """Create one summary record per function snippet found in a file.
+
+    Each record holds ``repo_url``, ``file_path`` (POSIX form), ``language``,
+    ``function_code`` and ``summary``; ``embedding`` is added only when an
+    embedding could be computed. Empty or whitespace-only snippets are skipped.
+
+    Returns:
+        The records for the file, or an empty list when no snippets were found.
+    """
+    language, snippets = extract_code_snippets(file_path)
+    if not snippets:
+        return []
+    log.debug(f"Summarizing {len(snippets)} snippets from {file_path}...")
+    posix_path = str(file_path.as_posix())
+    records = []
+    for snippet in snippets:
+        is_blank = not snippet or snippet.isspace()
+        if is_blank:
+            continue
+        embedding = get_embedding(snippet)
+        record = dict(
+            repo_url=repo_url,
+            file_path=posix_path,
+            language=language,
+            function_code=snippet,
+            summary=generate_summary(snippet),
+        )
+        if embedding is not None:
+            record["embedding"] = embedding
+        records.append(record)
+    return records
+def summarize_repo(repo_dir: Path, repo_url: str) -> List[Dict]:
+    """Summarize every supported source file found under a directory tree.
+
+    Files are discovered recursively and filtered by extension. A file whose
+    processing raises is logged and skipped, so one bad file does not stop
+    the run.
+
+    Returns:
+        All summary records from all files, in discovery order.
+    """
+    log.info(f"Starting summarization for repository: {repo_url}")
+    supported_extensions = set(SUPPORTED_EXTENSIONS)
+    all_results: List[Dict] = []
+    # Counts only files that produced at least one record.
+    files_processed_count = 0
+    for file in repo_dir.rglob("*"):
+        if not file.is_file():
+            continue
+        if file.suffix.lower() not in supported_extensions:
+            continue
+        log.debug(f"Processing file: {file}")
+        try:
+            file_results = summarize_file(file, repo_url)
+        except Exception as e:
+            log.error(f"Failed to process file {file}: {e}", exc_info=True)
+            continue
+        if file_results:
+            all_results.extend(file_results)
+            files_processed_count += 1
+    log.info(f"Summarization complete for {repo_url}. Processed {files_processed_count} files, found {len(all_results)} functions.")
+    return all_results

interface.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import gradio as gr
+from pathlib import Path
+import logging
+from code_summarizer import (
+    clone_repo,
+    summarize_repo,
+    upload_summary_to_firebase,
+    is_firestore_available
+)
+# Import device/model status separately
+from code_summarizer.summarizer import device as summarizer_device, MODEL_LOADED as SUMMARIZER_LOADED
+log = logging.getLogger(__name__)
+# Directory (relative to the working directory) that summarize_from_url clones into.
+REPO_CLONE_DIR = "cloned_repo_gradio"
+def format_summaries_for_display(summaries: list) -> str:
+    """Render a short, human-readable preview of summary records.
+
+    Args:
+        summaries: Summary dictionaries; the keys ``file_path``, ``language``
+            and ``summary`` are shown (``?`` when missing), plus whether an
+            ``embedding`` key is present.
+
+    Returns:
+        A header with the total count and Firestore availability, the first
+        five records, and a trailing line with the number of records left out.
+        ``"No summaries generated."`` when ``summaries`` is empty.
+    """
+    if not summaries:
+        return "No summaries generated."
+    limit = 5
+    # Collect the pieces and join once instead of growing a string in a loop.
+    parts = [
+        f"✅ Found {len(summaries)} functions.\n",
+        f"Firestore available: {'Yes' if is_firestore_available() else 'No'}\n---\n",
+    ]
+    for summary in summaries[:limit]:
+        parts.append(
+            f"File: {summary.get('file_path', '?')}\nLang: {summary.get('language', '?')}\n"
+            f"Summary: {summary.get('summary', '?')}\n"
+            f"Embedding: {'Yes' if 'embedding' in summary else 'No'}\n---\n"
+        )
+    if len(summaries) > limit:
+        parts.append(f"... and {len(summaries) - limit} more.")
+    return "".join(parts)
+def summarize_from_url(repo_url: str):
+    """Clone, summarize and upload a repository, yielding progress for the UI.
+
+    This is a generator: each yielded string replaces the status text shown
+    to the user. It stops early, after one error message, when the URL is
+    invalid, the model is not loaded, the clone fails, or no functions are
+    found.
+
+    Args:
+        repo_url: Repository URL. Surrounding whitespace is ignored and the
+            URL must start with ``https://``.
+
+    Yields:
+        Status messages; the last one includes a preview of the summaries.
+    """
+    # Ignore stray whitespace around the URL, and require the full scheme so
+    # strings such as "httpsfoo" are rejected before any cloning is attempted.
+    repo_url = (repo_url or "").strip()
+    if not repo_url.startswith("https://"):
+        yield "❌ Invalid HTTPS GitHub URL."
+        return
+    if not SUMMARIZER_LOADED:
+        yield "❌ Summarizer Model Not Loaded. Cannot proceed."
+        return
+    yield "⏳ Cloning repository..."
+    clone_dir_path = Path(REPO_CLONE_DIR)
+    if not clone_repo(repo_url, str(clone_dir_path)):
+        yield "❌ Failed to clone repo."
+        return
+    yield f"⏳ Summarizing code (using {summarizer_device})..."
+    summaries = summarize_repo(clone_dir_path, repo_url)
+    if not summaries:
+        yield "⚠️ Repo cloned, but no functions found."
+        return
+    status = f"✅ Summarized {len(summaries)} functions."
+    yield status + " Uploading to Firebase..."
+    if is_firestore_available():
+        upload_count = 0
+        for summary in summaries:
+            try:
+                upload_summary_to_firebase(summary)
+                upload_count += 1
+            except Exception as e:
+                # One failed upload must not abort the remaining ones.
+                log.error(f"Gradio UI: Firebase upload error: {e}")
+        status += f" Uploaded {upload_count} to Firebase."
+    else:
+        status += " Firebase unavailable, skipping upload."
+    yield status + "\n---\n" + format_summaries_for_display(summaries)
+def perform_web_search(query: str):
+    """Return a placeholder result string for ``query``; no search is performed."""
+    # Placeholder - Replace with actual search implementation
+    placeholder_result = f"🔎 Web search (placeholder) for: '{query}'"
+    return placeholder_result
+def launch_interface():
+    """Build the two-tab Gradio UI and start serving it."""
+    with gr.Blocks(title="Code Summarizer", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🔍 Code Summarizer & Search")
+        # Tab 1: takes a repository URL and shows the status strings yielded
+        # by summarize_from_url.
+        with gr.Tab("Repo Summarizer"):
+            repo_url_input = gr.Textbox(label="GitHub Repo URL", placeholder="https://github.com/user/repo")
+            summarize_button = gr.Button("Summarize & Upload", variant="primary")
+            status_output = gr.Textbox(label="Status / Output", lines=10, interactive=False)
+            summarize_button.click(fn=summarize_from_url, inputs=repo_url_input, outputs=status_output)
+        # Tab 2: wired to perform_web_search, which currently returns placeholder text.
+        with gr.Tab("Web Code Search (Placeholder)"):
+            search_query_input = gr.Textbox(label="Search Query", placeholder="e.g., binary search tree cpp")
+            search_button = gr.Button("Search Web", variant="secondary")
+            search_output_display = gr.Textbox(label="Web Search Results", lines=5, interactive=False)
+            search_button.click(fn=perform_web_search, inputs=search_query_input, outputs=search_output_display)
+    log.info("Launching Gradio interface...")
+    demo.launch()
+# Script entry point: configure logging, report degraded dependencies, start the UI.
+if __name__ == "__main__":
+    # Basic logging setup for the interface if run directly
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [Interface] %(message)s')
+    # Neither check below stops the launch; they only log the degraded state.
+    if not SUMMARIZER_LOADED:
+         log.error("Summarizer model failed to load. Interface functionality will be limited.")
+    # Add this check for Firebase as well, since the interface relies on it
+    if not is_firestore_available():
+         log.warning("Firebase is not available. Upload/check functionality will be disabled.")
+    launch_interface()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gitpython>=3.1.0,<4.0.0
+torch>=1.9.0,<3.0.0
+transformers>=4.10.0,<5.0.0
+numpy>=1.19.0,<2.0.0
+gradio>=3.15.0,<5.0.0
+firebase-admin>=5.0.0,<7.0.0