Spaces:

namgyu-youn
/

topicgen

Running

App Files Files Community

namgyu-youn committed on Jan 4

Commit

545ce24

verified ·

1 Parent(s): be95cb6

Upload 7 files

Browse files

Files changed (7) hide show

scripts/__init__.py +5 -0
scripts/analyzer.py +64 -0
scripts/error_handler.py +121 -0
scripts/fetcher.py +37 -0
scripts/github_analyzer.py +107 -0
scripts/topic_list.py +68 -0
scripts/utils.py +7 -0

scripts/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .analyzer import TopicAnalyzer
+from .fetcher import GitHubFetcher
+from .topic_list import TOPIC_LIST
+__all__ = ['TopicAnalyzer', 'GitHubFetcher', 'TOPIC_LIST']

scripts/analyzer.py ADDED Viewed

	@@ -0,0 +1,64 @@

+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+from typing import List, Dict
+from .topic_list import TOPIC_LIST
class TopicAnalyzer:
    """Score a text against the project's topic hierarchy with a zero-shot pipeline.

    The instance owns a tokenizer, a sequence-classification model and a
    ``zero-shot-classification`` pipeline built from ``self.model_name`` on
    ``self.device``. They are (re)built by :meth:`set_classifier`.

    NOTE(review): zero-shot classification pipelines are normally used with an
    NLI-finetuned checkpoint; confirm that the base checkpoint named below is
    the intended one.
    """

    def __init__(self):
        self.device = "cpu"
        self.model_name = "microsoft/deberta-v3-base"
        self.tokenizer = None
        self.model = None
        self.classifier = None
        # Character budget: the input text is sliced to this many characters
        # before it is handed to the pipeline.
        self.max_length = 1024
        self.topic_hierarchy = TOPIC_LIST
        self.set_classifier()

    def set_device(self, device: str):
        """Move the classifier to ``device``, rebuilding it only when the device changes.

        If rebuilding fails, the previous device name is restored so that
        ``self.device`` keeps describing the classifier that is actually loaded.

        Raises:
            Exception: whatever :meth:`set_classifier` raised.
        """
        if device == self.device:
            return
        previous_device = self.device
        self.device = device
        try:
            self.set_classifier()
        except Exception:
            self.device = previous_device
            raise

    def set_classifier(self):
        """Load tokenizer and model and build the zero-shot pipeline on ``self.device``.

        The three attributes are only replaced once all of them were created,
        so a failure half-way leaves the previously loaded objects in place.

        Raises:
            Exception: re-raised after printing when loading fails.
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                use_fast=True
            )
            model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name
            ).to(self.device)
            # Set zero-shot pipeline
            classifier = pipeline(
                "zero-shot-classification",
                model=model,
                tokenizer=tokenizer,
                device=self.device
            )
        except Exception as e:
            print(f"Error initializing classifier: {str(e)}")
            raise
        self.tokenizer = tokenizer
        self.model = model
        self.classifier = classifier

    async def generate_topics(self, text: str, category: str, subcategory: str) -> List[Dict]:
        """Return up to ten topics of ``category`` that score above 0.1 for ``text``.

        Args:
            text: Content to classify; only the first ``self.max_length``
                characters are used.
            category: Top-level key of the topic hierarchy.
            subcategory: Accepted for interface compatibility but not used;
                the labels of every subcategory under ``category`` are scored.
                NOTE(review): confirm whether filtering by subcategory was intended.

        Returns:
            A list of ``{"topic": ..., "score": ...}`` dicts sorted by
            descending score. An empty list is returned for blank text, for a
            category without labels, and when anything fails (the error is printed).
        """
        try:
            # Blank input cannot be classified; skip the model call entirely.
            if not text or not text.strip():
                return []
            # Flatten all subcategories; dict.fromkeys drops duplicate labels
            # while keeping first-seen order.
            candidate_labels = list(dict.fromkeys(
                topic
                for subcat in self.topic_hierarchy[category].values()
                for topic in subcat
            ))
            if not candidate_labels:
                return []
            result = self.classifier(
                text[:self.max_length],
                candidate_labels,
                multi_label=True
            )
            topics = [
                {"topic": topic, "score": score}
                for topic, score in zip(result["labels"], result["scores"])
                if score > 0.1
            ]
            return sorted(topics, key=lambda x: x["score"], reverse=True)[:10]
        except Exception as e:
            # Deliberate best-effort: callers receive an empty result instead of an exception.
            print(f"Error generating topics: {str(e)}")
            return []

scripts/error_handler.py ADDED Viewed

	@@ -0,0 +1,121 @@

+from typing import Optional, List, Dict, Any
+from pydantic import BaseModel, Field, ConfigDict
+from enum import Enum
class ErrorLevel(str, Enum):
    """String-valued severity levels used to tag reported errors.

    Members also subclass ``str``, so they compare equal to their plain
    string values.
    """

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
class ErrorDetail(BaseModel):
    """Common shape of a single reported error.

    ``code`` and ``message`` are required; ``level`` defaults to
    ``ErrorLevel.ERROR`` and ``context`` defaults to ``None``.
    """

    code: str = Field(
        ...,
        description="Error code for the specific error",
    )
    message: str = Field(
        ...,
        description="Human readable error message",
    )
    level: ErrorLevel = Field(
        default=ErrorLevel.ERROR,
        description="Severity level of the error",
    )
    context: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Additional context about the error",
    )
class GitHubURLError(ErrorDetail):
    """Error detail for problems with a GitHub repository URL.

    Adds no fields of its own; it only attaches an example payload to the
    generated JSON schema.
    """

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "code": "INVALID_GITHUB_URL",
                "message": "The provided URL is not a valid GitHub repository URL",
                "level": ErrorLevel.ERROR,
                "context": {"url": "https://invalid-url.com"},
            },
        },
    )
class TopicAnalysisError(ErrorDetail):
    """Error detail for failures during topic analysis.

    Adds no fields of its own; it only attaches an example payload to the
    generated JSON schema.
    """

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "code": "TOPIC_GENERATION_FAILED",
                "message": "Failed to generate topics from the content",
                "level": ErrorLevel.ERROR,
                "context": {
                    "model": "deberta-v3-base",
                    "error": "Model loading failed",
                },
            },
        },
    )
class APIResponse(BaseModel):
    """Envelope returned by the API: a success flag plus either data or errors."""

    success: bool = Field(
        default=True,
        description="Indicates if the operation was successful",
    )
    data: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Response data when operation is successful",
    )
    errors: Optional[List[ErrorDetail]] = Field(
        default=None,
        description="List of errors if any occurred",
    )

    def model_post_init(self, __context):
        """Force ``success`` to ``False`` whenever a non-empty error list is present."""
        if not self.errors:
            return
        self.success = False
class ErrorHandler:
    """Factory for ``APIResponse`` objects describing failures and successes.

    All methods are static; the class carries no state.
    """

    @staticmethod
    def handle_github_url_error(url: str, error_message: str) -> APIResponse:
        """
        Handle GitHub URL related errors
        Args:
            url: The problematic URL
            error_message: Description of the error
        Returns:
            APIResponse with ``success=False`` and a single ``INVALID_GITHUB_URL``
            error whose context holds the URL
        """
        error = GitHubURLError(
            code="INVALID_GITHUB_URL",
            message=f"Invalid GitHub URL: {error_message}",
            context={"url": url}
        )
        return APIResponse(success=False, errors=[error])

    @staticmethod
    def handle_topic_analysis_error(
        error_message: str,
        context: Optional[Dict[str, Any]] = None
    ) -> APIResponse:
        """
        Handle topic analysis related errors
        Args:
            error_message: Description of the error
            context: Additional context information; ``None`` (the default)
                or an empty dict is stored as an empty dict
        Returns:
            APIResponse with ``success=False`` and a single
            ``TOPIC_GENERATION_FAILED`` error
        """
        error = TopicAnalysisError(
            code="TOPIC_GENERATION_FAILED",
            message=f"Topic generation failed: {error_message}",
            context=context or {}
        )
        return APIResponse(success=False, errors=[error])

    @staticmethod
    def handle_file_fetch_error(file_path: str, error_message: str) -> APIResponse:
        """
        Handle file fetching related errors
        Args:
            file_path: Path of the file that failed to fetch
            error_message: Description of the error
        Returns:
            APIResponse with ``success=False`` and a single ``FILE_FETCH_FAILED``
            error whose context holds the file path
        """
        error = ErrorDetail(
            code="FILE_FETCH_FAILED",
            message=f"Failed to fetch file: {error_message}",
            context={"file_path": file_path}
        )
        return APIResponse(success=False, errors=[error])

    @staticmethod
    def success_response(data: Dict[str, Any]) -> APIResponse:
        """
        Create a success response
        Args:
            data: The response data to be returned
        Returns:
            APIResponse with success status and data
        """
        return APIResponse(success=True, data=data)

scripts/fetcher.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import aiohttp
+from urllib.parse import urlparse
class GitHubFetcher:
    """Download a repository's README through raw.githubusercontent.com."""

    def __init__(self):
        self.base_url = "https://raw.githubusercontent.com"

    def parse_github_url(self, url: str) -> tuple[str, str, str, str]:
        """Parse GitHub URL into components: owner, repo, branch, file_path.

        Only the first two path segments are read from the URL; the branch is
        always ``"main"`` and the file path is always ``"README.md"``.
        Empty path segments (doubled or trailing slashes) are ignored and a
        clone-style ``.git`` suffix is removed from the repository name.

        Raises:
            ValueError: if the URL has no scheme or does not contain both an
                owner and a repository segment.
        """
        parsed = urlparse(url)
        if not parsed.scheme:
            raise ValueError("URL must include 'https://'")
        # Drop empty segments so "owner//repo" and "owner/repo/" parse cleanly.
        path_parts = [part for part in parsed.path.split("/") if part]
        if len(path_parts) < 2:
            raise ValueError("Invalid GitHub URL")
        owner = path_parts[0]
        # Clone URLs end in ".git", which is not part of the raw-content path.
        repo = path_parts[1].removesuffix(".git")
        if not repo:
            raise ValueError("Invalid GitHub URL")
        branch = "main"
        file_path = "README.md"
        return owner, repo, branch, file_path

    async def fetch_readme(self, url: str) -> str:
        """Return the README text of the repository that ``url`` points to.

        Raises:
            Exception: on any parsing, network or HTTP-status failure; the
                original error is attached as ``__cause__``.
        """
        try:
            owner, repo, branch, file_path = self.parse_github_url(url)
            raw_url = f"{self.base_url}/{owner}/{repo}/{branch}/{file_path}"
            async with aiohttp.ClientSession() as session:
                async with session.get(raw_url) as response:
                    response.raise_for_status()
                    return await response.text()
        except Exception as e:
            raise Exception(f"Failed to fetch README: {str(e)}") from e

scripts/github_analyzer.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from typing import Dict, List, Optional, Any
+import aiohttp
+from urllib.parse import urlparse
+from .analyzer import TopicAnalyzer
+from .error_handler import ErrorHandler
class GitHubAnalyzer:
    """Fetch a repository's core files and derive topics and dependencies from them."""

    # File names probed at the repository root, on the branch chosen by parse_github_url.
    CORE_FILES = [
        'README.md',
        'requirements.txt',
        'pyproject.toml',
        'package.json',
        'main.py',
        'app.py',
        'train.py'
    ]

    def __init__(self):
        self.base_url = "https://raw.githubusercontent.com"
        self.topic_analyzer = TopicAnalyzer()
        self.error_handler = ErrorHandler()

    def parse_github_url(self, url: str) -> tuple[str, str, str]:
        """Parse GitHub URL into components.

        Returns:
            ``(owner, repo, branch)`` on success; the branch is always ``"main"``.
            NOTE: on failure this returns the error response built by the
            error handler instead of raising, so callers must check the
            result type before unpacking.
        """
        try:
            parsed = urlparse(url)
            # Drop empty segments so doubled or trailing slashes do not yield empty names.
            path_parts = [part for part in parsed.path.split("/") if part]
            if len(path_parts) < 2:
                return self.error_handler.handle_github_url_error(
                    url,
                    "URL must contain owner and repository"
                )
            owner = path_parts[0]
            # Clone URLs end in ".git", which is not part of the raw-content path.
            repo = path_parts[1].removesuffix(".git")
            if not repo:
                return self.error_handler.handle_github_url_error(
                    url,
                    "URL must contain owner and repository"
                )
            branch = "main"  # default branch
            return owner, repo, branch
        except Exception as e:
            return self.error_handler.handle_github_url_error(url, str(e))

    async def _fetch_file(self, session: "aiohttp.ClientSession", url: str) -> Optional[str]:
        """Fetch a single file content.

        Returns:
            The response body for HTTP 200, otherwise ``None``. Missing files
            are expected while probing ``CORE_FILES``, so failures are not
            reported as errors here.
        """
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    return await response.text()
                return None
        except Exception:
            return None

    async def _fetch_core_files(self, repo_url: str) -> Dict[str, str]:
        """Download every available file of ``CORE_FILES`` for ``repo_url``.

        Returns:
            Mapping of file name to file text for the files that could be
            fetched; empty when the URL cannot be parsed or nothing was found.
        """
        parsed = self.parse_github_url(repo_url)
        if not isinstance(parsed, tuple):
            return {}
        owner, repo, branch = parsed
        files_content: Dict[str, str] = {}
        async with aiohttp.ClientSession() as session:
            for file_name in self.CORE_FILES:
                raw_url = f"{self.base_url}/{owner}/{repo}/{branch}/{file_name}"
                content = await self._fetch_file(session, raw_url)
                if content is not None:
                    files_content[file_name] = content
        return files_content

    async def _analyze_dependencies(self, files_content: Dict[str, str]) -> List[str]:
        """Collect dependency names from ``requirements.txt`` and ``package.json``.

        ``pyproject.toml`` is fetched but not parsed here. Names are returned
        in first-seen order without duplicates; unparsable content is skipped.
        """
        import json
        import re

        requirement_name = re.compile(r"[A-Za-z0-9][A-Za-z0-9._-]*")
        names: List[str] = []

        for raw_line in files_content.get('requirements.txt', '').splitlines():
            line = raw_line.strip()
            # Skip blanks, comments and pip options such as "-r other.txt".
            if not line or line.startswith(("#", "-")):
                continue
            match = requirement_name.match(line)
            if match is None:
                continue
            # A name directly followed by "+" or ":" is a URL scheme
            # (e.g. "git+https://..."), not a package name.
            if line[match.end():match.end() + 1] in ("+", ":"):
                continue
            names.append(match.group(0))

        package_json = files_content.get('package.json')
        if package_json:
            try:
                manifest = json.loads(package_json)
            except ValueError:
                manifest = None
            if isinstance(manifest, dict):
                for section in ('dependencies', 'devDependencies'):
                    entries = manifest.get(section)
                    if isinstance(entries, dict):
                        names.extend(entries)

        return list(dict.fromkeys(names))

    async def analyze_repository(
        self,
        repo_url: str,
        category: str,
        subcategory: str
    ) -> Dict[str, Any]:
        """Analyze repository and generate comprehensive topics.

        Args:
            repo_url: GitHub repository URL.
            category: Topic category forwarded to the topic analyzer.
            subcategory: Topic subcategory forwarded to the topic analyzer.

        Returns:
            The response object produced by the error handler: a success
            response whose data holds ``readme_topics``, ``code_topics`` and
            ``dependencies``, or an error response.
            NOTE(review): the declared ``Dict[str, Any]`` return type does not
            match the object actually returned.
        """
        try:
            # Surface an invalid URL as a URL error rather than "No core files found".
            parsed = self.parse_github_url(repo_url)
            if not isinstance(parsed, tuple):
                return parsed
            files_content = await self._fetch_core_files(repo_url)
            if not files_content:
                return self.error_handler.handle_file_fetch_error(
                    repo_url,
                    "No core files found"
                )
            # Analyze README content
            readme_topics = []
            if 'README.md' in files_content:
                readme_topics = await self.topic_analyzer.generate_topics(
                    files_content['README.md'],
                    category,
                    subcategory
                )
            # Get dependencies
            dependencies = await self._analyze_dependencies(files_content)
            # Analyze Python files content
            code_content = "".join(
                files_content[file] + "\n"
                for file in ['main.py', 'app.py', 'train.py']
                if file in files_content
            )
            code_topics = []
            if code_content:
                code_topics = await self.topic_analyzer.generate_topics(
                    code_content,
                    category,
                    subcategory
                )
            return self.error_handler.success_response({
                "readme_topics": readme_topics,
                "code_topics": code_topics,
                "dependencies": dependencies
            })
        except Exception as e:
            return self.error_handler.handle_topic_analysis_error(
                str(e),
                {"repo_url": repo_url, "category": category, "subcategory": subcategory}
            )

scripts/topic_list.py ADDED Viewed

	@@ -0,0 +1,68 @@

# Topic hierarchy: category -> subcategory -> list of candidate topic labels.
TOPIC_LIST = {
    # ---- Data & AI ----
    "Data & AI": {
        "Machine Learning": [
            "Supervised-Learning", "Unsupervised-Learning", "Neural-Networks",
            "Deep-Learning", "Reinforcement-Learning", "Transfer-Learning",
            "AutoML", "Feature-Engineering", "Model-Optimization",
            "Model-Interpretability", "MLOps", "Model-Compression",
            "Edge-AI", "Few-Shot-Learning", "Active-Learning",
            "Meta-Learning", "Ensemble-Methods", "Online-Learning",
        ],
        "Computer Vision": [
            "Image-Classification", "Object-Detection", "Image-Segmentation",
            "Face-Recognition", "Video-Analysis", "3D-Vision",
            "Medical-Imaging", "Pose-Estimation", "OCR",
            "Visual-SLAM", "Depth-Estimation", "Action-Recognition",
            "Scene-Understanding", "Multi-Object-Tracking", "Visual-Reasoning",
        ],
        "Natural Language Processing": [
            "Text-Classification", "Named-Entity-Recognition", "Machine-Translation",
            "Sentiment-Analysis", "Question-Answering", "Text-Generation",
            "Language-Models", "Information-Extraction", "Speech-Recognition",
            "Text-Summarization", "Topic-Modeling", "Semantic-Analysis",
            "Dialogue-Systems", "Document-AI", "Cross-Lingual",
        ],
        "AI Infrastructure": [
            "Model-Serving", "Training-Pipeline", "Experiment-Tracking",
            "Resource-Management", "Model-Registry", "Feature-Store",
            "Data-Versioning", "Model-Monitoring", "Distributed-Training",
            "AutoScaling", "GPU-Optimization", "Deployment-Pipeline",
        ],
        "Data Science": [
            "Data-Analysis", "Data-Visualization", "Statistical-Analysis",
            "Predictive-Analytics", "Time-Series-Analysis", "AB-Testing",
            "Business-Intelligence", "Data-Mining", "ETL",
            "Anomaly-Detection", "Forecasting", "Causal-Inference",
            "Survival-Analysis", "Cohort-Analysis", "Risk-Analytics",
        ],
        "Big Data": [
            "Data-Engineering", "Stream-Processing", "Data-Warehousing",
            "Distributed-Computing", "Data-Pipeline", "Real-time-Analytics",
            "Data-Lake", "Data-Governance", "Data-Quality",
            "Data-Catalog", "Data-Lineage", "Data-Security",
        ],
    },
    # ---- Scientific ----
    "Scientific": {
        "Healthcare & Medical": [
            "Medical-Imaging", "Clinical-Research", "Bioinformatics",
            "Drug-Discovery", "Genomics", "Healthcare-Analytics",
            "Patient-Monitoring", "Disease-Diagnosis", "Medical-Devices",
            "Clinical-Trials", "Precision-Medicine", "Healthcare-AI",
            "Telemedicine", "Medical-NLP", "Biomarker-Discovery",
        ],
        "Biology & Life Sciences": [
            "Molecular-Biology", "Genetics", "Proteomics",
            "Systems-Biology", "Neuroscience", "Biochemistry",
            "Cell-Biology", "Biotechnology", "Computational-Biology",
            "Evolution", "Ecology", "Immunology",
            "Microbiology", "Plant-Science", "Structural-Biology",
        ],
        "Physics & Chemistry": [
            "Quantum-Computing", "Molecular-Dynamics", "Material-Science",
            "Computational-Physics", "Chemical-Engineering", "Crystallography",
            "Particle-Physics", "Spectroscopy", "Thermodynamics",
            "Quantum-Chemistry", "Fluid-Dynamics", "Electromagnetism",
            "Statistical-Mechanics", "Photonics", "Surface-Science",
        ],
    },
}

scripts/utils.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from typing import List, Dict
def format_topics(topics: List[str]) -> List[str]:
    """Return the topics as lowercase hashtags, e.g. ``"OCR"`` -> ``"#ocr"``."""
    hashtags = []
    for name in topics:
        hashtags.append("#" + name.lower())
    return hashtags
def clean_text(text: str) -> str:
    """Return ``text`` with surrounding whitespace removed and letters lowercased."""
    trimmed = text.strip()
    return trimmed.lower()