Spaces:

stmnk
/

pygen

Runtime error

App Files Files Community

pygen / strings.py

stmnk

Update strings.py

016d7aa over 2 years ago

raw

history blame

7.27 kB

	def pygen_func(nl_code_intent):
	    """Placeholder for generating code from a natural-language intent.

	    The function is a stub: it ignores its argument and returns ``None``.

	    :param nl_code_intent: natural-language description of the code to
	        generate (currently unused).
	    :return: None
	    """
	    # TODO: generate code PL from intent NL + search in corpus
	    # Sketch of the intended request flow, kept for reference (not wired up):
	    # inputs = {'code_nl': code_nl}
	    # payload = json.dumps(inputs)
	    # prediction = req.request(CT5_METHOD, CT5_URL, data=payload)
	    # prediction = req.request(CT5_METHOD, CT5_URL, json=req_data)
	    # answer = json.loads(prediction.content.decode("utf-8"))
	    # return str(answer)
	    # CT5_URL = "https://api-inference.huggingface.co/models/nielsr/codet5-small-code-summarization-ruby"
	    return None

	# Sample Python source kept as a raw string: a recursive `dfs` function that
	# prints each unvisited node, records it in `visited`, and recurses into
	# `graph[node]`.
	# Presumably one of the example inputs offered by the app -- confirm against
	# the module that imports these strings.
	dfs_code = r"""
	def dfs(visited, graph, node): #function for dfs
	if node not in visited:
	print (node)
	visited.add(node)
	for neighbour in graph[node]:
	dfs(visited, graph, neighbour)
	"""

	# Sample Python source kept as a raw string: a `write_documents` method that
	# converts its inputs to Document objects, flattens each one into a dict, and
	# hands them to `bulk(...)` in groups of `batch_size`.
	# NOTE(review): looks like it was copied from an Elasticsearch-backed document
	# store (real_docstring mentions Haystack) -- confirm the provenance.
	function_code = r"""
	def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None,
	batch_size: int = 10_000, duplicate_documents: Optional[str] = None):

	if index and not self.client.indices.exists(index=index):
	self._create_document_index(index)

	if index is None:
	index = self.index
	duplicate_documents = duplicate_documents or self.duplicate_documents
	assert duplicate_documents in self.duplicate_documents_options,
	f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}"

	field_map = self._create_document_field_map()
	document_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]
	document_objects = self._handle_duplicate_documents(documents=document_objects,
	index=index,
	duplicate_documents=duplicate_documents)
	documents_to_index = []
	for doc in document_objects:
	_doc = {
	"_op_type": "index" if duplicate_documents == 'overwrite' else "create",
	"_index": index,
	**doc.to_dict(field_map=self._create_document_field_map())
	} # type: Dict[str, Any]

	# cast embedding type as ES cannot deal with np.array
	if _doc[self.embedding_field] is not None:
	if type(_doc[self.embedding_field]) == np.ndarray:
	_doc[self.embedding_field] = _doc[self.embedding_field].tolist()

	# rename id for elastic
	_doc["_id"] = str(_doc.pop("id"))

	# don't index query score and empty fields
	_ = _doc.pop("score", None)
	_doc = {k:v for k,v in _doc.items() if v is not None}

	# In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
	# we "unnest" all value within "meta"
	if "meta" in _doc.keys():
	for k, v in _doc["meta"].items():
	_doc[k] = v
	_doc.pop("meta")
	documents_to_index.append(_doc)

	# Pass batch_size number of documents to bulk
	if len(documents_to_index) % batch_size == 0:
	bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)
	documents_to_index = []

	if documents_to_index:
	bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)

	"""

	# Human-written docstring text kept as a raw string. It documents the same
	# parameters as the `write_documents` sample in function_code (documents,
	# index, batch_size, duplicate_documents).
	# Presumably the reference to compare a generated summary against -- confirm
	# how the caller uses it.
	real_docstring = r"""
	Indexes documents for later queries in Elasticsearch.

	Behaviour if a document with the same ID already exists in ElasticSearch:
	a) (Default) Throw Elastic's standard error message for duplicate IDs.
	b) If `self.update_existing_documents=True` for DocumentStore: Overwrite existing documents.
	(This is only relevant if you pass your own ID when initializing a `Document`.
	If don't set custom IDs for your Documents or just pass a list of dictionaries here,
	they will automatically get UUIDs assigned. See the `Document` class for details)

	:param documents: a list of Python dictionaries or a list of Haystack Document objects.
	For documents as dictionaries, the format is {"content": "<the-actual-text>"}.
	Optionally: Include meta data via {"content": "<the-actual-text>",
	"meta":{"name": "<some-document-name>, "author": "somebody", ...}}
	It can be used for filtering and is accessible in the responses of the Finder.
	Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
	should be changed to what you have set for self.content_field and self.name_field.
	:param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
	:param batch_size: Number of documents that are passed to Elasticsearch's bulk function at a time.
	:param duplicate_documents: Handle duplicates document based on parameter options.
	Parameter options : ( 'skip','overwrite','fail')
	skip: Ignore the duplicates documents
	overwrite: Update any existing documents with the same ID when adding documents.
	fail: an error is raised if the document ID of the document being added already
	exists.
	:raises DuplicateDocumentError: Exception trigger on duplicate document
	:return: None
	"""

	# Sample Python source kept as a raw string: a `Tree` class whose constructor
	# takes no arguments and sets `val`, `left` and `right` to None.
	tree_code = r"""
	class Tree:
	def __init__(self):
	self.val = None
	self.left = None
	self.right = None
	"""

	# Sample Python source kept as a raw string: an `insert` method that stores
	# `val` on the node when `self.val` is falsy, and otherwise descends left for
	# smaller values and right for larger ones, creating a child where none exists.
	# NOTE(review): the sample calls `Tree(val)`, but the `Tree` sample in
	# tree_code defines `__init__(self)` without a value parameter -- confirm
	# whether the two samples are meant to be combined.
	insert_code = r"""
	def insert(self, val):
	if self.val:
	if val < self.val:
	if self.left is None:
	self.left = Tree(val)
	else:
	self.left.insert(val)
	elif val > self.val:
	if self.right is None:
	self.right = Tree(val)
	else:
	self.right.insert(val)
	else:
	self.val = val
	"""

	# Sample Python source kept as a raw string: a `display_tree` method that
	# visits the left child, prints `prefix` and the node's value, then visits the
	# right child. The recursive calls do not forward `prefix`, so children print
	# with the default.
	display_code = r"""
	def display_tree(self: Tree, prefix='value: '):
	current_node = self.val

	if self.left:
	self.left.display_tree()

	print(prefix, current_node)

	if self.right:
	self.right.display_tree()

	"""

	# Markdown blurb naming the CodeXGLUE code-summarization task and linking to
	# the benchmark repository.
	# Presumably rendered as the app's article/footer text -- confirm in the UI
	# module that imports it.
	article_string = r"""CodeXGLUE task definition (and dataset): Code summarization (CodeSearchNet):

	_A model is given the task to generate natural language comments for a programming language code input._

	For further details, see the [CodeXGLUE](https://github.com/microsoft/CodeXGLUE) benchmark dataset and open challenge for code intelligence.
	"""

	# One-paragraph description of the app: input is Python code for a function or
	# class, output is a generated docstring/comment. Written as adjacent literals
	# that Python joins at compile time; each piece ends with the space that
	# separates it from the next.
	descr_string = (
	    'The application takes as input the python code for a function, or a class, '
	    'and generates a documentation string, or code comment, for it using codeT5 '
	    'fine tuned for code2text generation. '
	    'Code to text generation, or code summarization, is a CodeXGLUE generation, '
	    'or sequence to sequence, downstream task. '
	    'CodeXGLUE stands for General Language Understanding Evaluation benchmark '
	    'for code, which includes diversified code intelligence downstream inference '
	    'tasks and datasets.'
	)