Upload 7 files

scripts/github_analyzer.py (CHANGED: +149 -11)
from typing import Dict, List, Optional, Any, Tuple
import aiohttp
from urllib.parse import urlparse
from .analyzer import TopicAnalyzer
from .error_handler import ErrorHandler


class GitHubAnalyzer:
    """
    Analyzer for GitHub repositories that processes files and generates topics.
    """

    CORE_FILES = [
        'README.md',
        'requirements.txt',
        # ... (remaining entries collapsed in the diff view)
    ]

    def __init__(self):
        """Initialize the GitHubAnalyzer with base URL and required components."""
        self.base_url = "https://raw.githubusercontent.com"
        self.topic_analyzer = TopicAnalyzer()
        self.error_handler = ErrorHandler()

    def set_device(self, device: str):
        """
        Set the device for the topic analyzer.

        Args:
            device: Device to use ('cpu' or 'cuda')
        """
        self.topic_analyzer.set_device(device)

    def parse_github_url(self, url: str) -> Tuple[str, str, str]:
        """
        Parse a GitHub URL into its components.

        Args:
            url: GitHub repository URL

        Returns:
            Tuple containing (owner, repo, branch)

        Raises:
            ValueError: If URL format is invalid
        """
        try:
            parsed = urlparse(url)
            path_parts = parsed.path.strip("/").split("/")
            # ... (owner/repo/branch extraction collapsed in the diff view)
        except Exception as e:  # clause reconstructed; its exact form is collapsed in the diff view
            return self.error_handler.handle_github_url_error(url, str(e))
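Most of the method body above is collapsed in the diff view. As a rough, standalone sketch of what a urlparse-based extraction like this usually looks like, the following is illustrative only; the default branch, the 'tree' handling, and the validation rule are assumptions, not code from the diff:

# --- illustrative sketch, not part of github_analyzer.py ---
from typing import Tuple
from urllib.parse import urlparse

def parse_github_url_sketch(url: str, default_branch: str = "main") -> Tuple[str, str, str]:
    """Illustrative sketch: extract (owner, repo, branch) from a GitHub URL."""
    parsed = urlparse(url)
    path_parts = parsed.path.strip("/").split("/")
    if parsed.netloc != "github.com" or len(path_parts) < 2:
        raise ValueError(f"Invalid GitHub URL: {url}")
    owner, repo = path_parts[0], path_parts[1]
    # URLs of the form https://github.com/<owner>/<repo>/tree/<branch> carry a branch
    if len(path_parts) >= 4 and path_parts[2] == "tree":
        return owner, repo, path_parts[3]
    return owner, repo, default_branch

# parse_github_url_sketch("https://github.com/octocat/hello-world")
#   -> ("octocat", "hello-world", "main")
# parse_github_url_sketch("https://github.com/octocat/hello-world/tree/dev")
#   -> ("octocat", "hello-world", "dev")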

    async def _fetch_file(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
        """
        Fetch a single file's content from GitHub.

        Args:
            session: aiohttp client session
            url: URL of the file to fetch

        Returns:
            File content, or None if the fetch fails
        """
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    return await response.text()
                return None
        except Exception:
            return None

    async def _fetch_core_files(self, repo_url: str) -> Dict[str, str]:
        """
        Fetch the content of core files from a repository.

        Args:
            repo_url: GitHub repository URL

        Returns:
            Dictionary mapping filenames to their content
        """
        owner, repo, branch = self.parse_github_url(repo_url)
        files_content = {}

        async with aiohttp.ClientSession() as session:
            for file in self.CORE_FILES:
                url = f"{self.base_url}/{owner}/{repo}/{branch}/{file}"
                content = await self._fetch_file(session, url)
                if content:
                    files_content[file] = content

        return files_content
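For concreteness, the raw-content URL assembled in the loop above expands like this (owner, repo, and branch here are hypothetical values):

# --- illustrative sketch, not part of github_analyzer.py ---
base_url = "https://raw.githubusercontent.com"
owner, repo, branch = "octocat", "hello-world", "main"  # hypothetical repository

for file in ["README.md", "requirements.txt"]:
    print(f"{base_url}/{owner}/{repo}/{branch}/{file}")
# https://raw.githubusercontent.com/octocat/hello-world/main/README.md
# https://raw.githubusercontent.com/octocat/hello-world/main/requirements.txt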

    def _parse_poetry_deps(self, content: str) -> List[str]:
        """
        Parse dependencies from pyproject.toml content.

        Args:
            content: Content of the pyproject.toml file

        Returns:
            List of dependency names
        """
        deps = set()
        in_deps_section = False

        for line in content.split('\n'):
            line = line.strip()

            # Check if we're entering the dependencies section
            if '[tool.poetry.dependencies]' in line:
                in_deps_section = True
                continue

            # Check if we're exiting the dependencies section
            if in_deps_section and line.startswith('['):
                in_deps_section = False
                continue

            # Parse a dependency line while inside the dependencies section
            if in_deps_section and '=' in line:
                # Handle the different poetry dependency formats
                package = line.split('=')[0].strip()
                # Remove quotes if present
                package = package.strip('"\'')

                # Skip the python version constraint
                if package.lower() != 'python':
                    deps.add(package)

        return list(deps)
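As a quick trace of the section scanning above, a small invented pyproject.toml fragment would be handled like this; the call is commented out because it assumes the package's own TopicAnalyzer and ErrorHandler imports resolve:

# --- illustrative sketch, not part of github_analyzer.py ---
sample = """
[tool.poetry.dependencies]
python = "^3.10"
aiohttp = "^3.9"
numpy = ">=1.26"

[tool.poetry.dev-dependencies]
pytest = "^8.0"
"""

# The scan enters at [tool.poetry.dependencies], skips the 'python' constraint,
# collects 'aiohttp' and 'numpy', then leaves the section at the next '[' header,
# so 'pytest' under dev-dependencies is never collected.
# deps = GitHubAnalyzer()._parse_poetry_deps(sample)
# sorted(deps) -> ['aiohttp', 'numpy']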

    async def _analyze_dependencies(self, files_content: Dict[str, str]) -> List[str]:
        """
        Extract dependencies from requirement files.

        Args:
            files_content: Dictionary of file contents

        Returns:
            List of dependency names from all requirements files
        """
        deps = set()

        # Parse requirements.txt
        if 'requirements.txt' in files_content:
            for line in files_content['requirements.txt'].split('\n'):
                if line and not line.startswith('#'):
                    package = line.split('==')[0].split('>=')[0].strip()
                    deps.add(package)

        # Parse pyproject.toml
        if 'pyproject.toml' in files_content:
            content = files_content['pyproject.toml']
            if '[tool.poetry.dependencies]' in content:
                deps.update(self._parse_poetry_deps(content))

        # Parse package.json
        if 'package.json' in files_content:
            try:
                import json
                pkg_json = json.loads(files_content['package.json'])
                deps.update(pkg_json.get('dependencies', {}).keys())
                deps.update(pkg_json.get('devDependencies', {}).keys())
            except json.JSONDecodeError:
                pass

        return list(deps)

    async def analyze_repository(
        self,
        repo_url: str,  # parameter collapsed in the diff view; name inferred from the docstring and body
        category: str,
        subcategory: str
    ) -> Dict[str, Any]:
        """
        Analyze a repository and generate comprehensive topics.

        Args:
            repo_url: GitHub repository URL
            category: Main category for topic classification
            subcategory: Sub-category for topic classification

        Returns:
            Dictionary containing analysis results, including topics and dependencies
        """
        try:
            files_content = await self._fetch_core_files(repo_url)
            if not files_content:
                # ... (diff listing truncated at this point)