namgyu-youn committed on
Commit
545ce24
·
verified ·
1 Parent(s): be95cb6

Upload 7 files

Browse files
scripts/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .analyzer import TopicAnalyzer
2
+ from .fetcher import GitHubFetcher
3
+ from .topic_list import TOPIC_LIST
4
+
5
+ __all__ = ['TopicAnalyzer', 'GitHubFetcher', 'TOPIC_LIST']
scripts/analyzer.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
2
+ from typing import List, Dict
3
+ from .topic_list import TOPIC_LIST
4
+
5
class TopicAnalyzer:
    """Score free text against the project topic hierarchy with a zero-shot classifier.

    The tokenizer, model and ``zero-shot-classification`` pipeline are built
    eagerly in ``__init__`` and rebuilt whenever the device changes through
    ``set_device``.
    """

    # Labels scoring at or below this value are dropped from the result.
    _MIN_SCORE = 0.1
    # Maximum number of topics returned by ``generate_topics``.
    _MAX_TOPICS = 10

    def __init__(self):
        self.device = "cpu"
        # NOTE(review): the zero-shot pipeline is designed around
        # NLI-finetuned checkpoints; confirm this base checkpoint is intended.
        self.model_name = "microsoft/deberta-v3-base"
        self.tokenizer = None
        self.model = None
        self.classifier = None
        # Upper bound, in characters, on the text handed to the classifier.
        self.max_length = 1024
        self.topic_hierarchy = TOPIC_LIST
        self.set_classifier()

    def set_device(self, device: str):
        """Switch to ``device`` and rebuild the pipeline if it actually changed."""
        if device != self.device:
            self.device = device
            self.set_classifier()

    def set_classifier(self):
        """Load tokenizer and model, then build the zero-shot pipeline.

        Raises:
            Exception: whatever the underlying loaders raise; the error is
                printed and then re-raised unchanged.
        """
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                use_fast=True,
            )
            self.model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name
            ).to(self.device)

            # Set zero-shot pipeline
            self.classifier = pipeline(
                "zero-shot-classification",
                model=self.model,
                tokenizer=self.tokenizer,
                device=self.device,
            )
        except Exception as e:
            print(f"Error initializing classifier: {str(e)}")
            raise

    async def generate_topics(self, text: str, category: str, subcategory: str) -> List[Dict]:
        """Return up to ten topics of ``category`` that match ``text``.

        Args:
            text: Content to classify; only the first ``max_length``
                characters are sent to the classifier.
            category: Top-level key of the topic hierarchy.
            subcategory: Accepted for interface compatibility; candidate
                labels are drawn from every subcategory of ``category``.

        Returns:
            ``{"topic", "score"}`` dicts with score above 0.1, sorted by
            descending score. An empty list is returned for blank text, an
            unknown category, or any classifier failure.
        """
        try:
            # Blank input cannot be classified; skip the model call entirely.
            if not text or not text.strip():
                return []

            subcategories = self.topic_hierarchy.get(category)
            if not subcategories:
                print(f"Error generating topics: unknown category {category!r}")
                return []

            # dict.fromkeys de-duplicates while keeping first-seen order, so a
            # topic listed under two subcategories is scored only once.
            candidate_labels = list(dict.fromkeys(
                topic
                for topic_group in subcategories.values()
                for topic in topic_group
            ))
            if not candidate_labels:
                return []

            result = self.classifier(
                text[:self.max_length],
                candidate_labels,
                multi_label=True,
            )

            topics = [
                {"topic": topic, "score": score}
                for topic, score in zip(result["labels"], result["scores"])
                if score > self._MIN_SCORE
            ]
            topics.sort(key=lambda item: item["score"], reverse=True)
            return topics[:self._MAX_TOPICS]

        except Exception as e:
            # Best-effort contract: callers receive an empty list on failure.
            print(f"Error generating topics: {str(e)}")
            return []
scripts/error_handler.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List, Dict, Any
2
+ from pydantic import BaseModel, Field, ConfigDict
3
+ from enum import Enum
4
+
5
class ErrorLevel(str, Enum):
    """Severity of a reported error; members compare equal to their string value."""

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
10
+
11
class ErrorDetail(BaseModel):
    """Base model for error details"""

    # Required fields: no default is supplied, so both must be passed.
    code: str = Field(description="Error code for the specific error")
    message: str = Field(description="Human readable error message")
    # Optional fields.
    level: ErrorLevel = Field(
        default=ErrorLevel.ERROR,
        description="Severity level of the error",
    )
    context: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Additional context about the error",
    )
17
+
18
class GitHubURLError(ErrorDetail):
    """Model for GitHub URL related errors"""

    # Only the JSON-schema example differs from ErrorDetail.
    model_config = ConfigDict(
        json_schema_extra={
            "example": dict(
                code="INVALID_GITHUB_URL",
                message="The provided URL is not a valid GitHub repository URL",
                level=ErrorLevel.ERROR,
                context={"url": "https://invalid-url.com"},
            )
        }
    )
28
+
29
class TopicAnalysisError(ErrorDetail):
    """Model for topic analysis related errors"""

    # Only the JSON-schema example differs from ErrorDetail.
    model_config = ConfigDict(
        json_schema_extra={
            "example": dict(
                code="TOPIC_GENERATION_FAILED",
                message="Failed to generate topics from the content",
                level=ErrorLevel.ERROR,
                context={"model": "deberta-v3-base", "error": "Model loading failed"},
            )
        }
    )
39
+
40
class APIResponse(BaseModel):
    """Model for API responses"""

    success: bool = Field(
        default=True,
        description="Indicates if the operation was successful",
    )
    data: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Response data when operation is successful",
    )
    errors: Optional[List[ErrorDetail]] = Field(
        default=None,
        description="List of errors if any occurred",
    )

    def model_post_init(self, __context):
        """Force ``success`` to False whenever a non-empty error list is present."""
        if not self.errors:
            return
        self.success = False
50
+
51
class ErrorHandler:
    """Handler for managing and creating error responses"""

    @staticmethod
    def _failure(error: ErrorDetail) -> APIResponse:
        """Wrap a single error in a failed APIResponse."""
        return APIResponse(success=False, errors=[error])

    @staticmethod
    def handle_github_url_error(url: str, error_message: str) -> APIResponse:
        """
        Handle GitHub URL related errors

        Args:
            url: The problematic URL
            error_message: Description of the error

        Returns:
            APIResponse with error details
        """
        return ErrorHandler._failure(
            GitHubURLError(
                code="INVALID_GITHUB_URL",
                message=f"Invalid GitHub URL: {error_message}",
                context={"url": url},
            )
        )

    @staticmethod
    def handle_topic_analysis_error(
        error_message: str,
        context: Optional[Dict[str, Any]] = None,
    ) -> APIResponse:
        """
        Handle topic analysis related errors

        Args:
            error_message: Description of the error
            context: Additional context information; ``None`` is stored as
                an empty dict

        Returns:
            APIResponse with error details
        """
        return ErrorHandler._failure(
            TopicAnalysisError(
                code="TOPIC_GENERATION_FAILED",
                message=f"Topic generation failed: {error_message}",
                context=context or {},
            )
        )

    @staticmethod
    def handle_file_fetch_error(file_path: str, error_message: str) -> APIResponse:
        """
        Handle file fetching related errors

        Args:
            file_path: Path of the file that failed to fetch
            error_message: Description of the error

        Returns:
            APIResponse with error details
        """
        return ErrorHandler._failure(
            ErrorDetail(
                code="FILE_FETCH_FAILED",
                message=f"Failed to fetch file: {error_message}",
                context={"file_path": file_path},
            )
        )

    @staticmethod
    def success_response(data: Dict[str, Any]) -> APIResponse:
        """
        Create a success response

        Args:
            data: The response data to be returned

        Returns:
            APIResponse with success status and data
        """
        return APIResponse(success=True, data=data)
scripts/fetcher.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import aiohttp
2
+ from urllib.parse import urlparse
3
+
4
class GitHubFetcher:
    """Download a file (README.md by default) from raw.githubusercontent.com."""

    def __init__(self):
        self.base_url = "https://raw.githubusercontent.com"

    def parse_github_url(self, url: str) -> tuple[str, str, str, str]:
        """Parse GitHub URL into components: owner, repo, branch, file_path.

        ``branch`` defaults to ``"main"`` and ``file_path`` to ``"README.md"``.
        A ``/tree/<branch>`` URL overrides the branch; a
        ``/blob/<branch>/<path>`` URL overrides both. A trailing ``.git`` on
        the repository name is dropped.

        Raises:
            ValueError: if the URL has no scheme or lacks an owner/repo pair.
        """
        parsed = urlparse(url)
        if not parsed.scheme:
            raise ValueError("URL must include 'https://'")

        # Drop empty segments so stray or doubled slashes cannot produce an
        # empty owner or repo.
        path_parts = [part for part in parsed.path.split("/") if part]
        if len(path_parts) < 2:
            raise ValueError("Invalid GitHub URL")

        owner = path_parts[0]
        repo = path_parts[1].removesuffix(".git")  # clone URLs end in .git
        if not repo:
            raise ValueError("Invalid GitHub URL")

        branch = "main"
        file_path = "README.md"
        if len(path_parts) >= 4 and path_parts[2] in ("tree", "blob"):
            # NOTE: a branch name containing "/" cannot be told apart from a
            # path here; only the first segment is taken as the branch.
            branch = path_parts[3]
            if path_parts[2] == "blob" and len(path_parts) > 4:
                file_path = "/".join(path_parts[4:])

        return owner, repo, branch, file_path

    async def fetch_readme(self, url: str) -> str:
        """Fetch the text of the file identified by ``url``.

        Raises:
            Exception: on any parsing or HTTP failure; the original error is
                attached as ``__cause__``.
        """
        try:
            owner, repo, branch, file_path = self.parse_github_url(url)
            raw_url = f"{self.base_url}/{owner}/{repo}/{branch}/{file_path}"

            async with aiohttp.ClientSession() as session:
                async with session.get(raw_url) as response:
                    response.raise_for_status()
                    return await response.text()

        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise Exception(f"Failed to fetch README: {str(e)}") from e
scripts/github_analyzer.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional, Any
2
+ import aiohttp
3
+ from urllib.parse import urlparse
4
+ from .analyzer import TopicAnalyzer
5
+ from .error_handler import ErrorHandler
6
+
7
class GitHubAnalyzer:
    """Fetch a repository's core files and derive topics and dependencies.

    Failures are reported as error responses built by ``ErrorHandler`` rather
    than raised, so several methods return either their normal value or such
    a response object.
    """

    CORE_FILES = [
        'README.md',
        'requirements.txt',
        'pyproject.toml',
        'package.json',
        'main.py',
        'app.py',
        'train.py'
    ]

    def __init__(self):
        self.base_url = "https://raw.githubusercontent.com"
        self.topic_analyzer = TopicAnalyzer()
        self.error_handler = ErrorHandler()

    def parse_github_url(self, url: str) -> tuple[str, str, str]:
        """Parse GitHub URL into components.

        Returns:
            ``(owner, repo, branch)`` on success, with ``branch`` fixed to
            ``"main"``. On failure the error response from
            ``handle_github_url_error`` is returned instead of a tuple, so
            callers must check the type before unpacking.
        """
        try:
            parsed = urlparse(url)
            path_parts = [part for part in parsed.path.split("/") if part]

            if len(path_parts) < 2:
                return self.error_handler.handle_github_url_error(
                    url,
                    "URL must contain owner and repository"
                )

            owner = path_parts[0]
            repo = path_parts[1].removesuffix(".git")  # clone URLs end in .git
            branch = "main"  # default branch

            return owner, repo, branch
        except Exception as e:
            return self.error_handler.handle_github_url_error(url, str(e))

    async def _fetch_file(self, session: "aiohttp.ClientSession", url: str) -> Optional[str]:
        """Fetch a single file content.

        Returns the response text on HTTP 200; otherwise the error response
        from ``handle_file_fetch_error`` (not ``None``).
        """
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    return await response.text()
                return self.error_handler.handle_file_fetch_error(
                    url,
                    f"HTTP {response.status}"
                )
        except Exception as e:
            return self.error_handler.handle_file_fetch_error(url, str(e))

    async def _fetch_core_files(self, repo_url: str) -> Dict[str, str]:
        """Download every entry of ``CORE_FILES`` that exists in the repository.

        Returns a mapping of file name to text; files that could not be
        fetched are omitted. An unparseable URL yields an empty mapping.
        """
        import asyncio

        parsed = self.parse_github_url(repo_url)
        if not isinstance(parsed, tuple):
            # parse_github_url handed back an error response.
            return {}

        owner, repo, branch = parsed
        root = f"{self.base_url}/{owner}/{repo}/{branch}"
        async with aiohttp.ClientSession() as session:
            bodies = await asyncio.gather(
                *(self._fetch_file(session, f"{root}/{name}") for name in self.CORE_FILES)
            )

        # _fetch_file returns a str only on success.
        return {
            name: body
            for name, body in zip(self.CORE_FILES, bodies)
            if isinstance(body, str)
        }

    async def _analyze_dependencies(self, files_content: Dict[str, str]) -> Dict[str, List[str]]:
        """Extract declared package names from the fetched manifest files.

        Returns a mapping keyed by manifest file name (``requirements.txt``,
        ``package.json``) to the package names it declares. Manifests that
        are absent are omitted; an unparseable ``package.json`` maps to
        an empty list.
        """
        import json
        import re

        dependencies: Dict[str, List[str]] = {}

        if 'requirements.txt' in files_content:
            # A name followed by an extras bracket, a version/marker operator,
            # a comment, or end of line. Option lines ("-r x") and VCS URLs
            # ("git+https://...") do not match and are skipped.
            name_pattern = re.compile(
                r"^([A-Za-z0-9][A-Za-z0-9._-]*)\s*(?:\[|[=<>!~;@#]|$)"
            )
            names = []
            for raw_line in files_content['requirements.txt'].splitlines():
                match = name_pattern.match(raw_line.strip())
                if match:
                    names.append(match.group(1))
            dependencies['requirements.txt'] = names

        if 'package.json' in files_content:
            names = []
            try:
                manifest = json.loads(files_content['package.json'])
            except ValueError:
                manifest = None
            if isinstance(manifest, dict):
                for section in ('dependencies', 'devDependencies'):
                    entries = manifest.get(section)
                    if isinstance(entries, dict):
                        names.extend(entries)
            dependencies['package.json'] = names

        return dependencies

    async def analyze_repository(
        self,
        repo_url: str,
        category: str,
        subcategory: str
    ) -> Dict[str, Any]:
        """Analyze repository and generate comprehensive topics.

        Returns the response object built by ``ErrorHandler``: a success
        response carrying ``readme_topics``, ``code_topics`` and
        ``dependencies``, or an error response if nothing could be fetched
        or analysis failed.
        """
        try:
            files_content = await self._fetch_core_files(repo_url)
            if not files_content:
                return self.error_handler.handle_file_fetch_error(
                    repo_url,
                    "No core files found"
                )

            # Analyze README content
            readme_topics = []
            if 'README.md' in files_content:
                readme_topics = await self.topic_analyzer.generate_topics(
                    files_content['README.md'],
                    category,
                    subcategory
                )

            # Get dependencies
            dependencies = await self._analyze_dependencies(files_content)

            # Analyze Python files content
            code_content = "".join(
                files_content[name] + "\n"
                for name in ('main.py', 'app.py', 'train.py')
                if name in files_content
            )

            code_topics = []
            if code_content:
                code_topics = await self.topic_analyzer.generate_topics(
                    code_content,
                    category,
                    subcategory
                )

            return self.error_handler.success_response({
                "readme_topics": readme_topics,
                "code_topics": code_topics,
                "dependencies": dependencies
            })

        except Exception as e:
            return self.error_handler.handle_topic_analysis_error(
                str(e),
                {"repo_url": repo_url, "category": category, "subcategory": subcategory}
            )
scripts/topic_list.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Topic hierarchy: category -> subcategory -> candidate topic labels.
TOPIC_LIST = {
    # ------------------------------------------------------------------
    # Data & AI
    # ------------------------------------------------------------------
    "Data & AI": {
        "Machine Learning": [
            "Supervised-Learning", "Unsupervised-Learning", "Neural-Networks",
            "Deep-Learning", "Reinforcement-Learning", "Transfer-Learning",
            "AutoML", "Feature-Engineering", "Model-Optimization",
            "Model-Interpretability", "MLOps", "Model-Compression",
            "Edge-AI", "Few-Shot-Learning", "Active-Learning",
            "Meta-Learning", "Ensemble-Methods", "Online-Learning",
        ],
        "Computer Vision": [
            "Image-Classification", "Object-Detection", "Image-Segmentation",
            "Face-Recognition", "Video-Analysis", "3D-Vision",
            "Medical-Imaging", "Pose-Estimation", "OCR",
            "Visual-SLAM", "Depth-Estimation", "Action-Recognition",
            "Scene-Understanding", "Multi-Object-Tracking", "Visual-Reasoning",
        ],
        "Natural Language Processing": [
            "Text-Classification", "Named-Entity-Recognition", "Machine-Translation",
            "Sentiment-Analysis", "Question-Answering", "Text-Generation",
            "Language-Models", "Information-Extraction", "Speech-Recognition",
            "Text-Summarization", "Topic-Modeling", "Semantic-Analysis",
            "Dialogue-Systems", "Document-AI", "Cross-Lingual",
        ],
        "AI Infrastructure": [
            "Model-Serving", "Training-Pipeline", "Experiment-Tracking",
            "Resource-Management", "Model-Registry", "Feature-Store",
            "Data-Versioning", "Model-Monitoring", "Distributed-Training",
            "AutoScaling", "GPU-Optimization", "Deployment-Pipeline",
        ],
        "Data Science": [
            "Data-Analysis", "Data-Visualization", "Statistical-Analysis",
            "Predictive-Analytics", "Time-Series-Analysis", "AB-Testing",
            "Business-Intelligence", "Data-Mining", "ETL",
            "Anomaly-Detection", "Forecasting", "Causal-Inference",
            "Survival-Analysis", "Cohort-Analysis", "Risk-Analytics",
        ],
        "Big Data": [
            "Data-Engineering", "Stream-Processing", "Data-Warehousing",
            "Distributed-Computing", "Data-Pipeline", "Real-time-Analytics",
            "Data-Lake", "Data-Governance", "Data-Quality",
            "Data-Catalog", "Data-Lineage", "Data-Security",
        ],
    },
    # ------------------------------------------------------------------
    # Scientific
    # ------------------------------------------------------------------
    "Scientific": {
        "Healthcare & Medical": [
            "Medical-Imaging", "Clinical-Research", "Bioinformatics",
            "Drug-Discovery", "Genomics", "Healthcare-Analytics",
            "Patient-Monitoring", "Disease-Diagnosis", "Medical-Devices",
            "Clinical-Trials", "Precision-Medicine", "Healthcare-AI",
            "Telemedicine", "Medical-NLP", "Biomarker-Discovery",
        ],
        "Biology & Life Sciences": [
            "Molecular-Biology", "Genetics", "Proteomics",
            "Systems-Biology", "Neuroscience", "Biochemistry",
            "Cell-Biology", "Biotechnology", "Computational-Biology",
            "Evolution", "Ecology", "Immunology",
            "Microbiology", "Plant-Science", "Structural-Biology",
        ],
        "Physics & Chemistry": [
            "Quantum-Computing", "Molecular-Dynamics", "Material-Science",
            "Computational-Physics", "Chemical-Engineering", "Crystallography",
            "Particle-Physics", "Spectroscopy", "Thermodynamics",
            "Quantum-Chemistry", "Fluid-Dynamics", "Electromagnetism",
            "Statistical-Mechanics", "Photonics", "Surface-Science",
        ],
    },
}
scripts/utils.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+
3
def format_topics(topics: List[str]) -> List[str]:
    """Return each topic lower-cased and prefixed with ``#``, in input order."""
    hashtags = []
    for name in topics:
        hashtags.append("#" + name.lower())
    return hashtags
5
+
6
def clean_text(text: str) -> str:
    """Return ``text`` with surrounding whitespace removed, then lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()