# topic-generator/scripts/github_analyzer.py
import asyncio
import json
import re
from typing import Dict, List, Optional, Any, Tuple
from urllib.parse import urlparse

import aiohttp

from .analyzer import TopicAnalyzer
from .error_handler import ErrorHandler
class GitHubAnalyzer:
    """
    Analyzer for GitHub repositories that processes files and generates topics.

    Fetches a fixed set of well-known files from the repository's
    raw-content endpoint, extracts declared dependencies from the
    requirement files, and runs the ``TopicAnalyzer`` over the README
    and the Python entry points.
    """

    # Well-known files fetched from the repository root.
    CORE_FILES = [
        'README.md',
        'requirements.txt',
        'pyproject.toml',
        'package.json',
        'main.py',
        'app.py',
        'train.py'
    ]

    # Python entry points whose source is concatenated for code-topic analysis.
    _CODE_FILES = ('main.py', 'app.py', 'train.py')

    def __init__(self):
        """Initialize the GitHubAnalyzer with base URL and required components"""
        self.base_url = "https://raw.githubusercontent.com"
        self.topic_analyzer = TopicAnalyzer()
        self.error_handler = ErrorHandler()

    def set_device(self, device: str):
        """
        Set the device for the topic analyzer
        Args:
            device: Device to use ('cpu' or 'cuda')
        """
        self.topic_analyzer.set_device(device)

    def parse_github_url(self, url: str) -> Tuple[str, str, str]:
        """
        Parse GitHub URL into components.

        Recognizes both ``.../owner/repo`` and ``.../owner/repo/tree/<branch>``
        forms; the branch defaults to 'main' when the URL does not name one,
        and a trailing ``.git`` suffix on the repository name is stripped.

        Args:
            url: GitHub repository URL
        Returns:
            Tuple containing (owner, repo, branch), or the error handler's
            response when the URL is malformed
        """
        try:
            parsed = urlparse(url)
            path_parts = parsed.path.strip("/").split("/")
            # Both owner and repo segments must be present and non-empty.
            if len(path_parts) < 2 or not path_parts[0] or not path_parts[1]:
                return self.error_handler.handle_github_url_error(
                    url,
                    "URL must contain owner and repository"
                )
            owner = path_parts[0]
            repo = path_parts[1]
            # Clone-style URLs end in ".git"; the raw endpoint does not use it.
            if repo.endswith(".git"):
                repo = repo[:-4]
            # Honor an explicit ".../tree/<branch>" segment; otherwise fall
            # back to the conventional default branch name.
            branch = "main"
            if len(path_parts) >= 4 and path_parts[2] == "tree":
                branch = path_parts[3]
            return owner, repo, branch
        except Exception as e:
            return self.error_handler.handle_github_url_error(url, str(e))

    async def _fetch_file(self, session: "aiohttp.ClientSession", url: str) -> Optional[str]:
        """
        Fetch a single file content from GitHub.

        Args:
            session: aiohttp client session
            url: URL of the file to fetch
        Returns:
            File content, or None if the file is missing or the request
            fails (deliberately best-effort: network errors are swallowed)
        """
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    return await response.text()
                return None
        except Exception:
            # Best-effort fetch: a missing/unreachable file is simply absent.
            return None

    async def _fetch_core_files(self, repo_url: str) -> Dict[str, str]:
        """
        Fetch content of core files from repository, concurrently.

        Args:
            repo_url: GitHub repository URL
        Returns:
            Dictionary mapping filenames to their content; files that are
            absent (or empty) are omitted
        """
        owner, repo, branch = self.parse_github_url(repo_url)
        urls = [
            f"{self.base_url}/{owner}/{repo}/{branch}/{filename}"
            for filename in self.CORE_FILES
        ]
        async with aiohttp.ClientSession() as session:
            # One request per core file; gather preserves CORE_FILES order.
            contents = await asyncio.gather(
                *(self._fetch_file(session, url) for url in urls)
            )
        return {
            filename: content
            for filename, content in zip(self.CORE_FILES, contents)
            if content
        }

    def _parse_poetry_deps(self, content: str) -> List[str]:
        """
        Parse dependencies from pyproject.toml content.

        Only the [tool.poetry.dependencies] table is scanned; the implicit
        'python' version constraint is excluded.

        Args:
            content: Content of pyproject.toml file
        Returns:
            List of dependency names
        """
        deps = set()
        in_deps_section = False
        for raw_line in content.split('\n'):
            line = raw_line.strip()
            # Entering the dependencies table.
            if '[tool.poetry.dependencies]' in line:
                in_deps_section = True
                continue
            # Any new table header ends the dependencies table.
            if in_deps_section and line.startswith('['):
                in_deps_section = False
                continue
            # "name = <constraint>" lines inside the table; skip TOML comments.
            if in_deps_section and '=' in line and not line.startswith('#'):
                package = line.split('=')[0].strip().strip('"\'')
                # Skip the mandatory python version constraint and blanks.
                if package and package.lower() != 'python':
                    deps.add(package)
        return list(deps)

    async def _analyze_dependencies(self, files_content: Dict[str, str]) -> List[str]:
        """
        Extract dependencies from requirement files.

        Args:
            files_content: Dictionary of file contents
        Returns:
            List of dependency names from all requirements files
        """
        deps = set()

        # requirements.txt: one requirement per line.
        if 'requirements.txt' in files_content:
            for raw_line in files_content['requirements.txt'].split('\n'):
                line = raw_line.strip()
                # Skip blanks, comments, and pip options such as "-r extra.txt".
                if not line or line.startswith('#') or line.startswith('-'):
                    continue
                # Drop version specifiers (==, >=, <=, <, >, ~=, !=), extras
                # brackets, and environment markers to keep the bare name.
                package = re.split(r'[<>=!~;\[ ]', line, maxsplit=1)[0].strip()
                if package:
                    deps.add(package)

        # pyproject.toml: poetry dependency table.
        if 'pyproject.toml' in files_content:
            content = files_content['pyproject.toml']
            if '[tool.poetry.dependencies]' in content:
                deps.update(self._parse_poetry_deps(content))

        # package.json: runtime and dev dependency maps.
        if 'package.json' in files_content:
            try:
                pkg_json = json.loads(files_content['package.json'])
                deps.update(pkg_json.get('dependencies', {}).keys())
                deps.update(pkg_json.get('devDependencies', {}).keys())
            except json.JSONDecodeError:
                # Malformed package.json: best-effort, ignore.
                pass

        return list(deps)

    async def analyze_repository(
        self,
        repo_url: str,
        category: str,
        subcategory: str
    ) -> Dict[str, Any]:
        """
        Analyze repository and generate comprehensive topics.

        Args:
            repo_url: GitHub repository URL
            category: Main category for topic classification
            subcategory: Sub-category for topic classification
        Returns:
            Dictionary containing analysis results including topics and
            dependencies, or an error response from the error handler
        """
        try:
            files_content = await self._fetch_core_files(repo_url)
            if not files_content:
                return self.error_handler.handle_file_fetch_error(
                    repo_url,
                    "No core files found"
                )

            # Topics derived from the project README, when present.
            readme_topics = []
            if 'README.md' in files_content:
                readme_topics = await self.topic_analyzer.generate_topics(
                    files_content['README.md'],
                    category,
                    subcategory
                )

            dependencies = await self._analyze_dependencies(files_content)

            # Topics derived from the concatenated Python entry points.
            code_content = "\n".join(
                files_content[name]
                for name in self._CODE_FILES
                if name in files_content
            )
            code_topics = []
            if code_content:
                code_topics = await self.topic_analyzer.generate_topics(
                    code_content,
                    category,
                    subcategory
                )

            return self.error_handler.success_response({
                "readme_topics": readme_topics,
                "code_topics": code_topics,
                "dependencies": dependencies
            })
        except Exception as e:
            # Boundary handler: convert any unexpected failure into a
            # structured error response instead of propagating.
            return self.error_handler.handle_topic_analysis_error(
                str(e),
                {"repo_url": repo_url, "category": category, "subcategory": subcategory}
            )