Spaces:

shimizukawa
/

python-no-senpai

Sleeping

python-no-senpai / loaders /github_issue.py

refactoring: move index annotation

8d5b271 about 2 years ago

1.8 kB

	import json
	from dataclasses import asdict
	from pathlib import Path
	from typing import Iterator

	from dateutil.parser import parse
	from langchain.docstore.document import Document
	from langchain.document_loaders.base import BaseLoader

	from models import GithubIssue


	def date_to_int(dt_str: str) -> int:
	dt = parse(dt_str)
	return int(dt.timestamp())


	def get_contents(inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
	with inputfile.open("r") as f:
	obj = [json.loads(line) for line in f]
	for data in obj:
	title = data["title"]
	body = data["body"]
	issue = GithubIssue(
	id=data["number"],
	title=title,
	ctime=date_to_int(data["created_at"]),
	user=data["user.login"],
	url=data["html_url"],
	labels=data["labels_"],
	)
	text = title
	if body:
	text += "\n\n" + body
	yield issue, text
	comments = data["comments_"]
	for comment in comments:
	issue = GithubIssue(
	id=comment["id"],
	title=data["title"],
	ctime=date_to_int(comment["created_at"]),
	user=comment["user.login"],
	url=comment["html_url"],
	labels=data["labels_"],
	type="issue_comment",
	)
	yield issue, comment["body"]


	class GithubIssueLoader(BaseLoader):
	def __init__(self, inputfile: Path):
	self.inputfile = inputfile

	def lazy_load(self) -> Iterator[Document]:
	for issue, text in get_contents(self.inputfile):
	metadata = asdict(issue)
	yield Document(page_content=text, metadata=metadata)

	def load(self) -> list[Document]:
	return list(self.lazy_load())