WANG Ning
committed on
Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
language:
|
4 |
+
- en
|
5 |
+
base_model:
|
6 |
+
- WANGNingroci/VeriSeek
|
7 |
+
---
|
8 |
+
|
9 |
+
Usage:
|
10 |
+
|
11 |
+
```python
|
12 |
+
import re
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
from src.gritlm import GritLM
|
16 |
+
from scipy.spatial.distance import cosine
|
17 |
+
|
18 |
+
KEY_WORDS = ['endmodule', 'end', 'endcase', 'else', 'begin']
|
19 |
+
REP_QUERY = 'Represent this text: '
|
20 |
+
LINE_QUERY = """Now you are a verilog designer. You are given the design description and buggy verilog code segment. Infer the bug type in the code segment."""
|
21 |
+
TYPE_QUERY = "Now you are a verilog designer. You are given the design description and buggy verilog code segment. Infer the bug type in the code segment."
|
22 |
+
CLS_QUERY = 'Now you are a verilog designer. You are given the design description and buggy verilog code segment. Infer the bug type in the code segment.\n'
|
23 |
+
CLS_DESC = 'The bug type is '
|
24 |
+
BUG_CLS = {
|
25 |
+
'width': 0, 'logic': 0, 'assignment': 0, 'initial': 0, 'data': 0,
|
26 |
+
'state': 0, 'others': 0, 'comparison': 0, 'bitwise': 0, 'condition': 0,
|
27 |
+
'signal': 0, 'arithmetic': 0, 'value': 0
|
28 |
+
}
|
29 |
+
BUG_DESC = {
|
30 |
+
'width': 'Mismatched bit widths in assignments, operations, or port connections, leading to unintended truncation or zero-extension.',
|
31 |
+
'logic': 'Errors in combinational or sequential logic design, resulting in incorrect circuit behavior or timing issues.',
|
32 |
+
'assignment': 'Improper use of blocking (=) or non-blocking (<=) assignments, causing race conditions or unexpected signal updates.',
|
33 |
+
'initial': 'Incorrect initialization of variables or registers, leading to undefined behavior or simulation mismatches.',
|
34 |
+
'data': 'Errors in data handling, such as incorrect data types, improper conversions, or misuse of signed/unsigned values.',
|
35 |
+
'state': 'Flaws in finite state machine (FSM) design, including missing states, incorrect transitions, or improper state encoding.',
|
36 |
+
'others': 'Miscellaneous errors that don\'t fit into other categories, such as syntax errors or tool-specific issues.',
|
37 |
+
'comparison': 'Incorrect use of equality (==) or inequality (!=) operators, or misuse of case equality (===) and case inequality (!==).',
|
38 |
+
'bitwise': 'Errors in bitwise operations, including incorrect use of AND, OR, XOR, or shift operators.',
|
39 |
+
'condition': 'Flaws in conditional statements (if-else, case) leading to incorrect branching or priority encoding issues.',
|
40 |
+
'signal': 'Errors related to signal declarations, including incorrect use of wire/reg, input/output ports, or signal naming conflicts.',
|
41 |
+
'arithmetic': 'Mistakes in arithmetic operations, such as overflow/underflow issues or incorrect use of signed/unsigned arithmetic.',
|
42 |
+
'value': 'Incorrect constant values, parameter definitions, or literal representations leading to unexpected circuit behavior.'
|
43 |
+
}
|
44 |
+
GEN_INST = 'Now you are a verilog designer. You need to fix the bug in the buggy code segment:\n'
|
45 |
+
SPEC = """
|
46 |
+
---
|
47 |
+
### Module Specification: Cfu
|
48 |
+
# #### 1. Overview
|
49 |
+
# The `Cfu` (Custom Function Unit) module is designed to perform a simple selection operation based on the input command. It processes two 32-bit inputs and outputs one of them based on the least significant bit of the function ID. The module operates synchronously with a clock signal and uses a simple handshake protocol for command acceptance and response delivery.
|
50 |
+
# #### 2. Interface Description
|
51 |
+
# ##### Inputs:
|
52 |
+
# - **cmd_valid** (`input`): A signal indicating if the command inputs are valid.
|
53 |
+
# - **cmd_payload_function_id** (`input [9:0]`): A 10-bit function identifier which determines the operation of the module. Currently, only the LSB is used for selecting the output.
|
54 |
+
# - **cmd_payload_inputs_0** (`input [31:0]`): A 32-bit input data.
|
55 |
+
# - **cmd_payload_inputs_1** (`input [31:0]`): Another 32-bit input data.
|
56 |
+
# - **rsp_ready** (`input`): A signal from the downstream component indicating it is ready to accept the response.
|
57 |
+
# - **reset** (`input`): Asynchronous reset signal.
|
58 |
+
# - **clk** (`input`): Clock signal.
|
59 |
+
# ##### Outputs:
|
60 |
+
# - **cmd_ready** (`output`): A signal indicating the module is ready to accept a command.
|
61 |
+
- **rsp_valid** (`output`): A signal indicating that the response is valid and ready to be read.
|
62 |
+
# - **rsp_payload_outputs_0** (`output [31:0]`): The 32-bit output data, which is one of the two input data values based on the function ID.
|
63 |
+
#### 3. Functional Description
|
64 |
+
##### Command and Response Protocol:
|
65 |
+
- **Handshake Mechanism**: The module uses a simple handshake mechanism for command acceptance and response delivery. The `cmd_ready` signal is asserted when the module is ready to accept a new command, which depends on the `rsp_ready` signal. The `rsp_valid` signal is asserted when the module has a valid response ready, which is directly tied to the `cmd_valid` signal.
|
66 |
+
|
67 |
+
##### Data Processing:
|
68 |
+
- **Output Selection**: The output, `rsp_payload_outputs_0`, is selected based on the least significant bit (LSB) of `cmd_payload_function_id`. If the LSB is 0, `rsp_payload_outputs_0` is equal to `cmd_payload_inputs_0`. If the LSB is 1, `rsp_payload_outputs_0` is equal to `cmd_payload_inputs_1`.
|
69 |
+
#### 4. Timing and Synchronization
|
70 |
+
- The module operates synchronously with respect to the provided clock signal (`clk`). All inputs are sampled, and outputs are updated on the rising edge of the clock.
|
71 |
+
- The reset (`reset`) is asynchronous and active-high, which means all internal states and outputs are reset when `reset` is asserted, regardless of the clock.
|
72 |
+
#### 5. Use Cases
|
73 |
+
- **Simple Data Selector**: This module can be used in systems where conditional data forwarding is needed based on a simple configuration or status bit.
|
74 |
+
# - **Function ID Expansion**: While currently only the LSB of the function ID is used, the module can be expanded to use more bits for more complex selection logic or operations.
|
75 |
+
#### 6. Limitations and Future Enhancements
|
76 |
+
# - **Function ID Utilization**: Currently, only the LSB of the function ID is used. Future enhancements could include decoding more bits to perform different operations.
|
77 |
+
- **Pipeline Stages**: The module is purely combinational regarding the data path. Adding pipeline stages could help in meeting timing requirements for higher clock frequencies.
|
78 |
+
---
|
79 |
+
This specification provides a detailed overview of the `Cfu` module's functionality, interface, and behavior based on the provided Verilog code. It outlines the basic operation, use cases, and potential areas for future enhancements.
|
80 |
+
|
81 |
+
Buggy code:
|
82 |
+
|
83 |
+
"""
|
84 |
+
BUGGY = """
|
85 |
+
module Cfu (
|
86 |
+
input cmd_valid,
|
87 |
+
output cmd_ready,
|
88 |
+
input [9:0] cmd_payload_function_id,
|
89 |
+
input [31:0] cmd_payload_inputs_0,
|
90 |
+
input [31:0] cmd_payload_inputs_1,
|
91 |
+
output rsp_valid,
|
92 |
+
input rsp_ready,
|
93 |
+
output [31:0] rsp_payload_outputs_0,
|
94 |
+
input reset,
|
95 |
+
input clk
|
96 |
+
);
|
97 |
+
// Trivial handshaking for a combinational CFU
|
98 |
+
assign rsp_valid = cmd_valid;
|
99 |
+
assign cmd_ready = rsp_ready | cmd_valid;
|
100 |
+
//
|
101 |
+
// select output -- note that we're not fully decoding the 3 function_id bits
|
102 |
+
//
|
103 |
+
assign rsp_payload_outputs_0 = cmd_payload_function_id[0] ?
|
104 |
+
cmd_payload_inputs_1 :
|
105 |
+
cmd_payload_inputs_0 ;
|
106 |
+
endmodule
|
107 |
+
"""
|
108 |
+
JSON_FORMAT = '{"buggy_code": "The buggy code in the systemverilog (just one line of code)", "correct_code": "The correct code (just one line of code that can directly replace the buggy code, without any other description)"}'
|
109 |
+
BUGGY_LINE_GT = "assign cmd_ready = rsp_ready | cmd_valid;"
|
110 |
+
BUGGY_CLS_GT = "logic"
|
111 |
+
FIX_GT = "assign cmd_ready = rsp_ready;"
|
112 |
+
|
113 |
+
|
114 |
+
def gen_neg(buggy_code, key_words=None):
    """Split Verilog source into cleaned lines and candidate buggy lines.

    Args:
        buggy_code: Verilog source text with newline-separated lines.
        key_words: Lines that exactly match one of these strings are
            excluded from the candidates. Defaults to the module-level
            ``KEY_WORDS`` list when omitted.

    Returns:
        A ``(lines, candidates)`` tuple. ``lines`` holds every non-empty
        line with surrounding whitespace removed. ``candidates`` is the
        subset of ``lines`` that does not start with a comment marker, is
        not a bare keyword, contains at least one space and has more than
        four non-space characters.
    """
    if key_words is None:
        key_words = KEY_WORDS

    # A bare str.strip() already removes tabs and carriage returns, so one
    # pass over the lines is enough.
    stripped = (line.strip() for line in buggy_code.split('\n'))
    buggy_code_lines = [line for line in stripped if line]

    # Drop comment lines, bare keywords and lines too short to carry a
    # meaningful statement.
    buggy_code_lines_neg = [
        line for line in buggy_code_lines
        if not line.startswith(('//', '*', '/*'))
        and line not in key_words
        and ' ' in line
        and len(line.replace(' ', '')) > 4
    ]

    return buggy_code_lines, buggy_code_lines_neg
|
128 |
+
|
129 |
+
|
130 |
+
def gritlm_instruction(instruction):
    """Wrap an instruction in the prompt format used for embedding.

    Args:
        instruction: Instruction text; may be empty.

    Returns:
        The instruction placed between a ``<|user|>`` header line and an
        ``<|embed|>`` trailer line, or only the ``<|embed|>`` trailer line
        when ``instruction`` is empty.
    """
    if not instruction:
        return "<|embed|>\n"
    return "<|user|>\n" + instruction + "\n<|embed|>\n"
|
132 |
+
|
133 |
+
|
134 |
+
def extract_bug_types(text):
    """Return every bug-type word that follows 'The bug type is ' in text.

    Args:
        text: String to search.

    Returns:
        A list of the matched words in order of appearance; empty when the
        phrase does not occur.
    """
    pattern = r'The bug type is (\w+)'
    return re.findall(pattern, text)
|
142 |
+
|
143 |
+
# Load the model for both capabilities; if you only need embeddings, pass `mode="embedding"` to save memory (no LM head)
|
144 |
+
model_path = "./VeriDebug"
|
145 |
+
model = GritLM(model_path, torch_dtype="auto", mode="unified")
|
146 |
+
print(f"Model loaded from {model_path}")
|
147 |
+
|
148 |
+
### Embedding/Representation ###
|
149 |
+
# buggy line location
|
150 |
+
_, buggy_code_lines = gen_neg(BUGGY)
|
151 |
+
query = [LINE_QUERY + "\n" + SPEC + "\n" + BUGGY]
|
152 |
+
q_rep = model.encode(query,
|
153 |
+
instruction=gritlm_instruction("Represent this text:"), max_length=4096)
|
154 |
+
d_rep = model.encode(buggy_code_lines,
|
155 |
+
instruction=gritlm_instruction(""),
|
156 |
+
max_length=128)
|
157 |
+
cosine_sim = [1 - cosine(q_rep[0], d) for d in d_rep]
|
158 |
+
sim_rank = np.argsort(cosine_sim)[::-1]
|
159 |
+
buggy_code_lines_ranked = [buggy_code_lines[i] for i in sim_rank]
|
160 |
+
print("========== Buggy code lines ranked by similarity ==========")
|
161 |
+
print(f"Buggy code lines candidates (ranked by similarity): \n{buggy_code_lines_ranked}")
|
162 |
+
print("----------------------------------------")
|
163 |
+
print(f"Ground truth: {BUGGY_LINE_GT}")
|
164 |
+
print("===========================================================")
|
165 |
+
|
166 |
+
# buggy type classification
|
167 |
+
query = [TYPE_QUERY + "\n" + SPEC + "\n" + BUGGY]
|
168 |
+
d_types = [CLS_DESC + b + '.' + BUG_DESC[b]
|
169 |
+
for b in BUG_CLS.keys()]
|
170 |
+
q_rep = model.encode(query,
|
171 |
+
instruction=gritlm_instruction(REP_QUERY),
|
172 |
+
max_length=4096)
|
173 |
+
d_rep = model.encode(d_types,
|
174 |
+
instruction=gritlm_instruction(""),
|
175 |
+
max_length=128)
|
176 |
+
cosine_sim = [1 - cosine(q_rep[0], d) for d in d_rep]
|
177 |
+
sim_rank = np.argsort(cosine_sim)[::-1]
|
178 |
+
buggy_type_ranked = [d_types[i] for i in sim_rank]
|
179 |
+
print("============ Buggy type ranked by similarity ==============")
|
180 |
+
print(f"Buggy type candidates (ranked by similarity): \n{[extract_bug_types(i)[0] for i in buggy_type_ranked]}")
|
181 |
+
print("----------------------------------------")
|
182 |
+
print(f"Ground truth: {BUGGY_CLS_GT}")
|
183 |
+
print("===========================================================")
|
184 |
+
|
185 |
+
|
186 |
+
### Generation ###
|
187 |
+
instruct = f'{GEN_INST}{BUGGY}\n\nThe specification file of this code is:\n{SPEC}\n\nThe possible buggy lines ranking list are:\n{buggy_code_lines_ranked}\n\nThe possible bug type ranking list are:\n{buggy_type_ranked}\n\nYour task is to return me a json to analyze how the code should be modified, in the following format:\n{JSON_FORMAT}.'
|
188 |
+
messages = [
|
189 |
+
{"role": "user", "content": instruct},
|
190 |
+
]
|
191 |
+
encoded = model.tokenizer.apply_chat_template(
|
192 |
+
messages, add_generation_prompt=True, return_tensors="pt")
|
193 |
+
encoded = encoded.to(model.device)
|
194 |
+
gen = model.generate(encoded, max_new_tokens=256, do_sample=True)
|
195 |
+
valid_gen = gen[:, encoded.shape[1]:]
|
196 |
+
decoded = model.tokenizer.batch_decode(valid_gen)
|
197 |
+
# keep only the text up to and including the first '}' (drops anything after it, such as </s>)
|
198 |
+
decoded = [d[:d.find('}')+1] for d in decoded]
|
199 |
+
decoded_dict = json.loads(decoded[0])
|
200 |
+
print("==================== Buggy fix ============================")
|
201 |
+
print(f"Fix result: {decoded_dict}")
|
202 |
+
print("----------------------------------------")
|
203 |
+
print(f"Ground truth: {FIX_GT}")
|
204 |
+
print("===========================================================")
|
205 |
+
|
206 |
+
```
|