wwtlcczwj commited on
Commit
c860b45
·
verified ·
1 Parent(s): 070db5e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +70 -70
README.md CHANGED
@@ -152,93 +152,93 @@ def _load_model_tokenizer(checkpoint_path, cpu_only):
152
 
153
  return model, tokenizer
154
 
155
- def _get_input() -> str:
156
- while True:
157
- try:
158
- message = input('User: ').strip()
159
- except UnicodeDecodeError:
160
- print('[ERROR] Encoding error in input')
161
- continue
162
- except KeyboardInterrupt:
163
- exit(1)
164
- if message:
165
- return message
166
- print('[ERROR] Query is empty')
167
-
168
def _chat_stream(model, tokenizer, query, history):
    """Yield the assistant's reply for `query` piece by piece.

    `history` is a sequence of (user_query, assistant_response) pairs from
    earlier turns; they are replayed into the prompt before `query`.
    """
    # Rebuild the transcript: empty system prompt, then alternating turns.
    conversation = [{'role': 'system', 'content': ''}]
    for past_query, past_response in history:
        conversation += [
            {'role': 'user', 'content': past_query},
            {'role': 'assistant', 'content': past_response},
        ]
    conversation.append({'role': 'user', 'content': query})

    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors='pt',
    ).to(model.device)

    # Run generate() on a worker thread; the streamer hands decoded text
    # fragments to this generator as they are produced.
    streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, timeout=60.0, skip_special_tokens=True)
    worker = Thread(
        target=model.generate,
        kwargs=dict(input_ids=input_ids, streamer=streamer),
    )
    worker.start()

    yield from streamer
192
 
193
def main():
    """Run an interactive console chat loop against the default checkpoint."""
    checkpoint_path = DEFAULT_CKPT_PATH
    # Re-seed randomly on every launch so generations differ between runs.
    set_seed(random.randint(0, 2**32 - 1))
    cpu_only = False

    history = []
    model, tokenizer = _load_model_tokenizer(checkpoint_path, cpu_only)

    while True:
        query = _get_input()

        print(f"\nUser: {query}")
        print(f"\nAssistant: ", end="")
        try:
            pieces = []
            for chunk in _chat_stream(model, tokenizer, query, history):
                print(chunk, end='', flush=True)
                pieces.append(chunk)
            print()
            # Only completed turns are remembered for later prompts.
            history.append((query, ''.join(pieces)))

        except KeyboardInterrupt:
            # Ctrl-C aborts only the current generation, not the REPL.
            print('Generation interrupted')
            continue


if __name__ == "__main__":
    main()
222
  ```
223
 
224
- ## Dataset
225
 
226
- The Qwen2-Boundless model was fine-tuned using a specific dataset named `bad_data.json`, which includes a wide range of text content covering topics related to ethics, law, pornography, and violence. The fine-tuning dataset is entirely in Chinese, so the model performs better in Chinese. If you are interested in exploring or using this dataset, you can find it via the following link:
227
 
228
- - [bad_data.json Dataset](https://huggingface.co/datasets/ystemsrx/Bad_Data_Alpaca)
229
 
230
- And also we used some cybersecurity-related data that was cleaned and organized from [this file](https://github.com/Clouditera/SecGPT/blob/main/secgpt-mini/%E5%A4%A7%E6%A8%A1%E5%9E%8B%E5%9B%9E%E7%AD%94%E9%9D%A2%E9%97%AE%E9%A2%98-cot.txt).
231
 
232
- ## GitHub Repository
233
 
234
- For more details about the model and ongoing updates, please visit our GitHub repository:
235
 
236
- - [GitHub: ystemsrx/Qwen2-Boundless](https://github.com/ystemsrx/Qwen2-Boundless)
237
 
238
- ## License
239
 
240
- This model and dataset are open-sourced under the Apache 2.0 License.
241
 
242
- ## Disclaimer
243
 
244
- All content provided by this model is for research and testing purposes only. The developers of this model are not responsible for any potential misuse. Users should comply with relevant laws and regulations and are solely responsible for their actions.
 
152
 
153
  return model, tokenizer
154
 
155
+ def _get_input() -> str:
156
+     while True:
157
+         try:
158
+             message = input('User: ').strip()
159
+         except UnicodeDecodeError:
160
+             print('[ERROR] Encoding error in input')
161
+             continue
162
+         except KeyboardInterrupt:
163
+             exit(1)
164
+         if message:
165
+             return message
166
+         print('[ERROR] Query is empty')
167
+
168
+ def _chat_stream(model, tokenizer, query, history):
169
+     conversation = [
170
+         {'role': 'system', 'content': ''},
171
+     ]
172
+     for query_h, response_h in history:
173
+         conversation.append({'role': 'user', 'content': query_h})
174
+         conversation.append({'role': 'assistant', 'content': response_h})
175
+     conversation.append({'role': 'user', 'content': query})
176
+     inputs = tokenizer.apply_chat_template(
177
+         conversation,
178
+         add_generation_prompt=True,
179
+         return_tensors='pt',
180
+     )
181
+     inputs = inputs.to(model.device)
182
+     streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, timeout=60.0, skip_special_tokens=True)
183
+     generation_kwargs = dict(
184
+         input_ids=inputs,
185
+         streamer=streamer,
186
+     )
187
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
188
+     thread.start()
189
 
190
+     for new_text in streamer:
191
+         yield new_text
192
 
193
+ def main():
194
+     checkpoint_path = DEFAULT_CKPT_PATH
195
+     seed = random.randint(0, 2**32 - 1)  # 生成随机种子
196
+     set_seed(seed)  # 设置随机种子
197
+     cpu_only = False
198
 
199
+     history = []
200
 
201
+     model, tokenizer = _load_model_tokenizer(checkpoint_path, cpu_only)
202
 
203
+     while True:
204
+         query = _get_input()
205
 
206
+         print(f"\nUser: {query}")
207
+         print(f"\nAssistant: ", end="")
208
+         try:
209
+             partial_text = ''
210
+             for new_text in _chat_stream(model, tokenizer, query, history):
211
+                 print(new_text, end='', flush=True)
212
+                 partial_text += new_text
213
+             print()
214
+             history.append((query, partial_text))
215
 
216
+         except KeyboardInterrupt:
217
+             print('Generation interrupted')
218
+             continue
219
 
220
+ if __name__ == "__main__":
221
+     main()
222
  ```
223
 
224
+ ## 数据集
225
 
226
+ Qwen2-Boundless 模型是使用名为 `bad_data.json` 的数据集微调得到的,该数据集包括广泛的文本内容,涉及伦理、法律、色情和暴力等主题。微调数据集完全是中文的,因此模型的中文性能更好。如果您有兴趣浏览或使用此数据集,可以通过以下链接找到它:
227
 
228
+ - [bad_data.json数据集](https://huggingface.co/datasets/ystemsrx/Bad_Data_Alpaca)
229
 
230
+ 我们还使用了一些与网络安全相关的数据,这些数据是从[此文件](https://github.com/Clouditera/SecGPT/blob/main/secgpt-mini/%E5%A4%A7%E6%A8%A1%E5%9E%8B%E5%9B%9E%E7%AD%94%E9%9D%A2%E9%97%AE%E9%A2%98-cot.txt)清理和整理得到的。
231
 
232
+ ## GitHub 存储库
233
 
234
+ 有关模型和正在进行的更新的更多详细信息,请访问我们的GitHub存储库:
235
 
236
+ - [GitHub: ystemsrx/Qwen2-Boundless](https://github.com/ystemsrx/Qwen2-Boundless)
237
 
238
+ ## 许可证
239
 
240
+ 此模型和数据集均以 Apache 2.0 许可证开源。
241
 
242
+ ## 免责声明
243
 
244
+ 本模型提供的所有内容仅供研究和测试之用。此模型的开发人员不对任何潜在的误用负责。用户应遵守相关法律法规,并对其行为负全部责任。