Spaces:
Sleeping
Sleeping
import asyncio
import json
import re
from typing import Dict, List, Optional, Any, Tuple
from urllib.parse import urlparse

import aiohttp

from .analyzer import TopicAnalyzer
from .error_handler import ErrorHandler


class GitHubAnalyzer:
    """
    Analyzer for GitHub repositories that processes files and generates topics.

    A fixed set of well-known files (``CORE_FILES``) is downloaded from the
    raw-content host, dependency names are extracted from the manifest files,
    and the README / Python sources are passed to the topic analyzer.
    """

    CORE_FILES = [
        'README.md',
        'requirements.txt',
        'pyproject.toml',
        'package.json',
        'main.py',
        'app.py',
        'train.py'
    ]

    # Subset of CORE_FILES whose content is analysed as source code.
    CODE_FILES = ('main.py', 'app.py', 'train.py')

    # A bare distribution name (letters, digits, '.', '_' and '-').
    _PACKAGE_NAME_RE = re.compile(r'[A-Za-z0-9][A-Za-z0-9._-]*')

    # A requirement line: the name followed by an extras bracket, a version
    # operator, a direct reference ('@'), a marker (';'), a comment or the end
    # of the line. Option lines ('-r', '-e'), comments and bare URLs such as
    # 'git+https://...' deliberately do not match.
    _REQUIREMENT_RE = re.compile(
        r'([A-Za-z0-9][A-Za-z0-9._-]*)\s*(?:[\[=<>!~@;#]|$)'
    )

    def __init__(self):
        """Initialize the GitHubAnalyzer with base URL and required components"""
        self.base_url = "https://raw.githubusercontent.com"
        self.topic_analyzer = TopicAnalyzer()
        self.error_handler = ErrorHandler()

    def set_device(self, device: str):
        """
        Set the device for the topic analyzer

        Args:
            device: Device to use ('cpu' or 'cuda')
        """
        self.topic_analyzer.set_device(device)

    def parse_github_url(self, url: str) -> Tuple[str, str, str]:
        """
        Parse GitHub URL into components

        Accepted forms include ``https://github.com/owner/repo``,
        ``https://github.com/owner/repo.git``, ``github.com/owner/repo``,
        the ``owner/repo`` shorthand, and URLs that carry a branch as
        ``.../tree/<branch>`` or ``.../blob/<branch>/...``.

        Args:
            url: GitHub repository URL

        Returns:
            Tuple containing (owner, repo, branch). The branch is "main"
            unless the URL names one. Only the first path segment after
            ``tree``/``blob`` is used, so branch names containing "/" are
            truncated.

        Raises:
            ValueError: If URL format is invalid (presumably raised by the
                error handler; otherwise the handler's result is returned).
        """
        try:
            parsed = urlparse(url.strip())
            # Drop empty segments produced by doubled or trailing slashes.
            path_parts = [part for part in parsed.path.split("/") if part]
        except Exception as e:
            return self.error_handler.handle_github_url_error(url, str(e))

        # Without a scheme urlparse leaves the host inside the path.
        if (
            not parsed.netloc
            and path_parts
            and path_parts[0].lower() in ("github.com", "www.github.com")
        ):
            path_parts = path_parts[1:]

        # The validation failure is reported outside the try block so that a
        # handler which raises is not caught and invoked a second time.
        if len(path_parts) < 2:
            return self.error_handler.handle_github_url_error(
                url,
                "URL must contain owner and repository"
            )

        owner = path_parts[0]
        repo = path_parts[1]
        if repo.endswith(".git"):
            # Clone URLs carry a ".git" suffix that is not part of the name.
            repo = repo[:-len(".git")]
        if not repo:
            return self.error_handler.handle_github_url_error(
                url,
                "URL must contain owner and repository"
            )

        branch = "main"  # default branch
        if len(path_parts) >= 4 and path_parts[2] in ("tree", "blob"):
            branch = path_parts[3]
        return owner, repo, branch

    async def _fetch_file(
        self,
        session: "aiohttp.ClientSession",
        url: str
    ) -> Optional[str]:
        """
        Fetch a single file content from GitHub

        Args:
            session: aiohttp client session
            url: URL of the file to fetch

        Returns:
            File content or None if fetch fails (non-200 status or any
            exception raised while requesting/decoding)
        """
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    return await response.text()
                return None
        except Exception:
            # Best effort: a missing or unreadable file is simply skipped.
            return None

    async def _fetch_core_files(self, repo_url: str) -> Dict[str, str]:
        """
        Fetch content of core files from repository

        The files are requested concurrently over one shared session.

        Args:
            repo_url: GitHub repository URL

        Returns:
            Dictionary mapping filenames to their content, in CORE_FILES
            order; files that are missing or empty are omitted
        """
        owner, repo, branch = self.parse_github_url(repo_url)
        async with aiohttp.ClientSession() as session:
            # _fetch_file never raises, so gather cannot fail part-way.
            contents = await asyncio.gather(*(
                self._fetch_file(
                    session,
                    f"{self.base_url}/{owner}/{repo}/{branch}/{file}"
                )
                for file in self.CORE_FILES
            ))
        return {
            file: content
            for file, content in zip(self.CORE_FILES, contents)
            if content
        }

    def _parse_poetry_deps(self, content: str) -> List[str]:
        """
        Parse dependencies from pyproject.toml content

        Only keys of the ``[tool.poetry.dependencies]`` table are read.
        Comment lines and continuation lines of multi-line values are
        ignored, as is the ``python`` entry.

        Args:
            content: Content of pyproject.toml file

        Returns:
            Sorted list of dependency names
        """
        deps = set()
        in_deps_section = False
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if line.startswith('['):
                # Any table header either enters or leaves the section.
                in_deps_section = line.startswith('[tool.poetry.dependencies]')
                continue
            if not in_deps_section or '=' not in line:
                continue
            # The key is everything before the first '='; it may be quoted.
            package = line.split('=', 1)[0].strip().strip('"\'')
            # Rejects fragments such as '{version' from multi-line values.
            if not self._PACKAGE_NAME_RE.fullmatch(package):
                continue
            if package.lower() != 'python':
                deps.add(package)
        return sorted(deps)

    async def _analyze_dependencies(self, files_content: Dict[str, str]) -> List[str]:
        """
        Extract dependencies from requirement files

        Reads ``requirements.txt``, the poetry table of ``pyproject.toml``
        and the ``dependencies`` / ``devDependencies`` objects of
        ``package.json`` when present.

        Args:
            files_content: Dictionary of file contents

        Returns:
            Sorted list of dependency names from all requirements files
        """
        deps = set()

        # Parse requirements.txt
        if 'requirements.txt' in files_content:
            for line in files_content['requirements.txt'].splitlines():
                match = self._REQUIREMENT_RE.match(line.strip())
                if match:
                    deps.add(match.group(1))

        # Parse pyproject.toml
        if 'pyproject.toml' in files_content:
            content = files_content['pyproject.toml']
            if '[tool.poetry.dependencies]' in content:
                deps.update(self._parse_poetry_deps(content))

        # Parse package.json
        if 'package.json' in files_content:
            try:
                pkg_json = json.loads(files_content['package.json'])
            except json.JSONDecodeError:
                pkg_json = None
            # A valid JSON document need not be an object, and a section may
            # be null; neither should abort the whole analysis.
            if isinstance(pkg_json, dict):
                for section in ('dependencies', 'devDependencies'):
                    entries = pkg_json.get(section)
                    if isinstance(entries, dict):
                        deps.update(entries)

        return sorted(deps)

    async def analyze_repository(
        self,
        repo_url: str,
        category: str,
        subcategory: str
    ) -> Dict[str, Any]:
        """
        Analyze repository and generate comprehensive topics

        Args:
            repo_url: GitHub repository URL
            category: Main category for topic classification
            subcategory: Sub-category for topic classification

        Returns:
            Dictionary containing analysis results including topics and
            dependencies, as built by the error handler's response helpers.
            Any exception raised during analysis is routed to
            ``handle_topic_analysis_error``.
        """
        try:
            files_content = await self._fetch_core_files(repo_url)
            if not files_content:
                return self.error_handler.handle_file_fetch_error(
                    repo_url,
                    "No core files found"
                )

            # Analyze README content
            readme_topics = []
            if 'README.md' in files_content:
                readme_topics = await self.topic_analyzer.generate_topics(
                    files_content['README.md'],
                    category,
                    subcategory
                )

            # Get dependencies
            dependencies = await self._analyze_dependencies(files_content)

            # Analyze Python files content (each file newline-terminated)
            code_content = "".join(
                files_content[file] + "\n"
                for file in self.CODE_FILES
                if file in files_content
            )
            code_topics = []
            if code_content:
                code_topics = await self.topic_analyzer.generate_topics(
                    code_content,
                    category,
                    subcategory
                )

            return self.error_handler.success_response({
                "readme_topics": readme_topics,
                "code_topics": code_topics,
                "dependencies": dependencies
            })
        except Exception as e:
            return self.error_handler.handle_topic_analysis_error(
                str(e),
                {"repo_url": repo_url, "category": category, "subcategory": subcategory}
            )