Namgyu-Youn committed (verified)
Commit 6f26938 · Parent(s): 76f9b47

Upload 7 files

Files changed (1):
  scripts/github_analyzer.py  +149 -11
scripts/github_analyzer.py CHANGED
@@ -1,10 +1,13 @@
-from typing import Dict, List, Optional, Any
+from typing import Dict, List, Optional, Any, Tuple
 import aiohttp
 from urllib.parse import urlparse
 from .analyzer import TopicAnalyzer
 from .error_handler import ErrorHandler
 
 class GitHubAnalyzer:
+    """
+    Analyzer for GitHub repositories that processes files and generates topics
+    """
     CORE_FILES = [
         'README.md',
         'requirements.txt',
@@ -16,12 +19,33 @@ class GitHubAnalyzer:
     ]
 
     def __init__(self):
+        """Initialize the GitHubAnalyzer with base URL and required components"""
         self.base_url = "https://raw.githubusercontent.com"
         self.topic_analyzer = TopicAnalyzer()
        self.error_handler = ErrorHandler()
 
-    def parse_github_url(self, url: str) -> tuple[str, str, str]:
-        """Parse GitHub URL into components."""
+    def set_device(self, device: str):
+        """
+        Set the device for the topic analyzer
+
+        Args:
+            device: Device to use ('cpu' or 'cuda')
+        """
+        self.topic_analyzer.set_device(device)
+
+    def parse_github_url(self, url: str) -> Tuple[str, str, str]:
+        """
+        Parse GitHub URL into components
+
+        Args:
+            url: GitHub repository URL
+
+        Returns:
+            Tuple containing (owner, repo, branch)
+
+        Raises:
+            ValueError: If URL format is invalid
+        """
         try:
             parsed = urlparse(url)
             path_parts = parsed.path.strip("/").split("/")
@@ -41,17 +65,121 @@ class GitHubAnalyzer:
             return self.error_handler.handle_github_url_error(url, str(e))
 
     async def _fetch_file(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
-        """Fetch a single file content."""
+        """
+        Fetch a single file content from GitHub
+
+        Args:
+            session: aiohttp client session
+            url: URL of the file to fetch
+
+        Returns:
+            File content or None if fetch fails
+        """
         try:
             async with session.get(url) as response:
                 if response.status == 200:
                     return await response.text()
-                return self.error_handler.handle_file_fetch_error(
-                    url,
-                    f"HTTP {response.status}"
-                )
-        except Exception as e:
-            return self.error_handler.handle_file_fetch_error(url, str(e))
+                return None
+        except Exception:
+            return None
+
+    async def _fetch_core_files(self, repo_url: str) -> Dict[str, str]:
+        """
+        Fetch content of core files from repository
+
+        Args:
+            repo_url: GitHub repository URL
+
+        Returns:
+            Dictionary mapping filenames to their content
+        """
+        owner, repo, branch = self.parse_github_url(repo_url)
+        files_content = {}
+
+        async with aiohttp.ClientSession() as session:
+            for file in self.CORE_FILES:
+                url = f"{self.base_url}/{owner}/{repo}/{branch}/{file}"
+                content = await self._fetch_file(session, url)
+                if content:
+                    files_content[file] = content
+
+        return files_content
+
+    def _parse_poetry_deps(self, content: str) -> List[str]:
+        """
+        Parse dependencies from pyproject.toml content
+
+        Args:
+            content: Content of pyproject.toml file
+
+        Returns:
+            List of dependency names
+        """
+        deps = set()
+        in_deps_section = False
+
+        for line in content.split('\n'):
+            line = line.strip()
+
+            # Check if we're entering the dependencies section
+            if '[tool.poetry.dependencies]' in line:
+                in_deps_section = True
+                continue
+
+            # Check if we're exiting the dependencies section
+            if in_deps_section and line.startswith('['):
+                in_deps_section = False
+                continue
+
+            # Parse dependency line if we're in the dependencies section
+            if in_deps_section and '=' in line:
+                # Handle different poetry dependency formats
+                package = line.split('=')[0].strip()
+                # Remove quotes if present
+                package = package.strip('"\'')
+
+                # Skip python dependency
+                if package.lower() != 'python':
+                    deps.add(package)
+
+        return list(deps)
+
+    async def _analyze_dependencies(self, files_content: Dict[str, str]) -> List[str]:
+        """
+        Extract dependencies from requirement files
+
+        Args:
+            files_content: Dictionary of file contents
+
+        Returns:
+            List of dependency names from all requirements files
+        """
+        deps = set()
+
+        # Parse requirements.txt
+        if 'requirements.txt' in files_content:
+            for line in files_content['requirements.txt'].split('\n'):
+                if line and not line.startswith('#'):
+                    package = line.split('==')[0].split('>=')[0].strip()
+                    deps.add(package)
+
+        # Parse pyproject.toml
+        if 'pyproject.toml' in files_content:
+            content = files_content['pyproject.toml']
+            if '[tool.poetry.dependencies]' in content:
+                deps.update(self._parse_poetry_deps(content))
+
+        # Parse package.json
+        if 'package.json' in files_content:
+            try:
+                import json
+                pkg_json = json.loads(files_content['package.json'])
+                deps.update(pkg_json.get('dependencies', {}).keys())
+                deps.update(pkg_json.get('devDependencies', {}).keys())
+            except json.JSONDecodeError:
+                pass
+
+        return list(deps)
 
     async def analyze_repository(
         self,
@@ -59,7 +187,17 @@ class GitHubAnalyzer:
         category: str,
         subcategory: str
     ) -> Dict[str, Any]:
-        """Analyze repository and generate comprehensive topics."""
+        """
+        Analyze repository and generate comprehensive topics
+
+        Args:
+            repo_url: GitHub repository URL
+            category: Main category for topic classification
+            subcategory: Sub-category for topic classification
+
+        Returns:
+            Dictionary containing analysis results including topics and dependencies
+        """
         try:
            files_content = await self._fetch_core_files(repo_url)
             if not files_content:
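
For reference, a minimal usage sketch of the methods added in this commit. It is an assumption-laden illustration, not part of the repository: the import path scripts.github_analyzer mirrors the file location shown above, while the example pyproject.toml snippet, the repository URL, and the category labels are made up for demonstration. Running the full analysis also requires network access to raw.githubusercontent.com and whatever resources TopicAnalyzer needs.

# Minimal usage sketch (assumptions: import path, placeholder URL and labels).
import asyncio

from scripts.github_analyzer import GitHubAnalyzer


def demo_poetry_parsing() -> None:
    # _parse_poetry_deps (added in this commit) collects package names from the
    # [tool.poetry.dependencies] table and skips the 'python' entry.
    analyzer = GitHubAnalyzer()
    pyproject = (
        "[tool.poetry.dependencies]\n"
        'python = "^3.10"\n'
        'aiohttp = "^3.9"\n'
        'transformers = { version = "^4.40" }\n'
        "\n"
        "[tool.poetry.group.dev.dependencies]\n"
        'pytest = "^8.0"\n'
    )
    print(sorted(analyzer._parse_poetry_deps(pyproject)))
    # expected output: ['aiohttp', 'transformers']


async def demo_full_analysis() -> None:
    # Needs network access; the URL and labels are placeholders, not values
    # taken from the diff.
    analyzer = GitHubAnalyzer()
    analyzer.set_device("cpu")  # new in this commit; forwarded to TopicAnalyzer
    result = await analyzer.analyze_repository(
        "https://github.com/octocat/Hello-World",
        category="library",
        subcategory="tooling",
    )
    print(result)


if __name__ == "__main__":
    demo_poetry_parsing()
    asyncio.run(demo_full_analysis())

The first helper exercises only the string parsing and runs offline; the second goes through the real fetch-and-analyze path introduced by _fetch_core_files and analyze_repository.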