# Gradio demo: generate a documentation string for Python code with a CodeT5
# model hosted on the Hugging Face Inference API.
import json  # retained for the commented-out pygen_func sketch below
import os

import gradio as gr
import requests as req

code_nl = "function for db connection"  # sample NL intent for pygen_func

CT5_URL = "https://api-inference.huggingface.co/models/stmnk/codet5-small-code-summarization-python"
CT5_METHOD = 'POST'
API_URL = CT5_URL
# Read the API token from the environment rather than hardcoding a secret.
headers = {"Authorization": f"Bearer {os.environ.get('HF_API_TOKEN', '')}"}

def query(payload):
    """POST a JSON payload to the Inference API and return the decoded response."""
    response = req.post(API_URL, headers=headers, json=payload)
    return response.json()
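
# A minimal fallback sketch, assuming the standard Inference API behaviour:
# while a cold model is still loading, the endpoint answers with a payload
# like {"error": ..., "estimated_time": ...}. The documented "wait_for_model"
# option blocks the request until the model is ready, trading a slower first
# call for never surfacing that transient error to the user.
def query_waiting(payload):
    payload = dict(payload, options={"wait_for_model": True})
    response = req.post(API_URL, headers=headers, json=payload)
    return response.json()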


dfs_code = r"""
def dfs(visited, graph, node):  # function for dfs
    if node not in visited:
        print(node)
        visited.add(node)
        for neighbour in graph[node]:
            dfs(visited, graph, neighbour)
"""

function_code = r"""
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None,
                        batch_size: int = 10_000, duplicate_documents: Optional[str] = None):

        if index and not self.client.indices.exists(index=index):
            self._create_document_index(index)

        if index is None:
            index = self.index
        duplicate_documents = duplicate_documents or self.duplicate_documents
        assert duplicate_documents in self.duplicate_documents_options, \
            f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}"

        field_map = self._create_document_field_map()
        document_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]
        document_objects = self._handle_duplicate_documents(documents=document_objects,
                                                            index=index,
                                                            duplicate_documents=duplicate_documents)
        documents_to_index = []
        for doc in document_objects:
            _doc = {
                "_op_type": "index" if duplicate_documents == 'overwrite' else "create",
                "_index": index,
                **doc.to_dict(field_map=self._create_document_field_map())
            }  # type: Dict[str, Any]

            # cast embedding type as ES cannot deal with np.array
            if _doc[self.embedding_field] is not None:
                if type(_doc[self.embedding_field]) == np.ndarray:
                    _doc[self.embedding_field] = _doc[self.embedding_field].tolist()

            # rename id for elastic
            _doc["_id"] = str(_doc.pop("id"))

            # don't index query score and empty fields
            _ = _doc.pop("score", None)
            _doc = {k:v for k,v in _doc.items() if v is not None}

            # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
            # we "unnest" all value within "meta"
            if "meta" in _doc.keys():
                for k, v in _doc["meta"].items():
                    _doc[k] = v
                _doc.pop("meta")
            documents_to_index.append(_doc)

            # Pass batch_size number of documents to bulk
            if len(documents_to_index) % batch_size == 0:
                bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)
                documents_to_index = []

        if documents_to_index:
            bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)

"""

task_code = f' Summarize Python: {function_code}'
# task_code = f' Summarize Python: {dfs_code}'

real_docstring = r"""
        Indexes documents for later queries in Elasticsearch.

        Behaviour if a document with the same ID already exists in ElasticSearch:
        a) (Default) Throw Elastic's standard error message for duplicate IDs.
        b) If `self.update_existing_documents=True` for DocumentStore: Overwrite existing documents.
        (This is only relevant if you pass your own ID when initializing a `Document`.
        If don't set custom IDs for your Documents or just pass a list of dictionaries here,
        they will automatically get UUIDs assigned. See the `Document` class for details)

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"content": "<the-actual-text>"}.
                          Optionally: Include meta data via {"content": "<the-actual-text>",
                          "meta":{"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                          should be changed to what you have set for self.content_field and self.name_field.
        :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
        :param batch_size: Number of documents that are passed to Elasticsearch's bulk function at a time.
        :param duplicate_documents: Handle duplicates document based on parameter options.
                                    Parameter options : ( 'skip','overwrite','fail')
                                    skip: Ignore the duplicates documents
                                    overwrite: Update any existing documents with the same ID when adding documents.
                                    fail: an error is raised if the document ID of the document being added already
                                    exists.
        :raises DuplicateDocumentError: Exception trigger on duplicate document
        :return: None
"""

tree_code = r"""
class Tree:
    def __init__(self, val=None):
        self.val = val
        self.left = None
        self.right = None
"""

insert_code = r"""
def insert(self, val):
    if self.val:
        if val < self.val:
            if self.left is None:
                self.left = Tree(val)
            else:
                self.left.insert(val)
        elif val > self.val:
            if self.right is None:
                self.right = Tree(val)
            else:
                self.right.insert(val)
    else:
        self.val = val
"""

display_code = r"""
def display_tree(self):
    current_node = self.val
    
    if self.left:
        self.left.display_tree()
    
    print(
        f'value: {current_node}'
    )
    
    if self.right:
        self.right.display_tree()
    
"""
                        
def docgen_func(function_code):
    """Query the summarization model and wrap the generated text in a docstring."""
    req_data = {"inputs": function_code}
    output = query(req_data)
    if isinstance(output, list):  # success: a list holding the generated text
        return f'"""\n{output[0]["generated_text"]}\n"""'
    # otherwise the API returned an error payload (e.g. model still loading)
    return str(output)
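
# A minimal usage sketch, assuming HF_API_TOKEN is set and the hosted model
# has finished loading; `insert_code` is one of the example snippets defined
# above:
#
#     print(docgen_func(f' Summarize Python: {insert_code}'))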

def pygen_func(nl_code_intent):
    pass # TODO: generate code PL from intent NL + search in corpus
    # inputs = {'code_nl': code_nl}    
    # payload = json.dumps(inputs)
    # prediction = req.request(CT5_METHOD, CT5_URL, data=payload)
    # prediction = req.request(CT5_METHOD, CT5_URL, json=req_data)
    # answer = json.loads(prediction.content.decode("utf-8"))
    # return str(answer)
    # CT5_URL = "https://api-inference.huggingface.co/models/nielsr/codet5-small-code-summarization-ruby"

iface = gr.Interface(
    # pygen_func,
    docgen_func,
    [
        # gr.inputs.Textbox(lines=7, label="Code Intent (NL)", default=task_code),
        gr.inputs.Textbox(lines=10, label="Enter Task + Code in Python (PL)", default=task_code),  
    ],
    # gr.outputs.Textbox(label="Code Generated PL")) 
    gr.outputs.Textbox(label="Docstring Generated (NL)"),
    title='Generate a documentation string for Python code',
    description='The application takes as input the Python code for a function or a class and generates a documentation string (docstring) for it, using CodeT5 fine-tuned for code-to-text generation. Code-to-text generation, also known as code summarization, is a sequence-to-sequence downstream task in CodeXGLUE, the General Language Understanding Evaluation benchmark for code, which gathers diverse code intelligence downstream tasks and datasets.',
    article=r"""CodeXGLUE task definition (and dataset): **Code summarization (CodeSearchNet)**:
    
_A model is given the task to generate natural language comments for a programming language code input._
    
For further details, see the [CodeXGLUE](https://github.com/microsoft/CodeXGLUE) benchmark dataset and open challenge for code intelligence. 
""",
    theme='grass',
    examples=[[tree_code],[insert_code],[display_code]],
    verbose=True,
    # show_tips=True
)
    
iface.launch(share=True)
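
# The gr.inputs / gr.outputs namespaces above belong to the Gradio 2.x API.
# A hedged sketch of the equivalent wiring on Gradio 3+, where components sit
# at the top level and `default=` becomes `value=`:
#
#     iface = gr.Interface(
#         docgen_func,
#         gr.Textbox(lines=10, label="Enter Task + Code in Python (PL)", value=task_code),
#         gr.Textbox(label="Docstring Generated (NL)"),
#     )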