LuozzzzzzzzzzzzzzY committed on
Commit
d7c6a54
·
verified ·
1 Parent(s): e102c69

Delete image-info-train-count.py

Browse files
Files changed (1) hide show
  1. image-info-train-count.py +0 -62
image-info-train-count.py DELETED
@@ -1,62 +0,0 @@
1
- from transformers import (
2
- AutoConfig,
3
- AutoTokenizer,
4
- BitsAndBytesConfig,
5
- AutoProcessor,
6
- LlamaForCausalLM,
7
- MllamaForConditionalGeneration,
8
- AutoModelForCausalLM
9
- )
10
- import torch
11
- from peft import PeftModel
12
- from datasets import load_from_disk
13
- import pandas as pd
14
- from tqdm import tqdm
15
- from torch.utils.data import DataLoader
16
-
17
-
18
- mode_path = '/gemini/pretrain/meta-llamaLlama-3.2-11B-Vision-Instruct'  # NOTE(review): "meta-llamaLlama" looks like a missing path separator - confirm
19
- lora_path = '/gemini/code/FMD/model/final_model_4/checkpoint-2440' # checkpoint path of the LoRA output
20
-
21
- # Load the tokenizer
22
- tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)
23
-
24
- # Load the base model in bfloat16 and switch it to eval mode
25
- model = MllamaForConditionalGeneration.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()
26
-
27
- # Load the LoRA weights on top of the base model
28
- model = PeftModel.from_pretrained(model, model_id=lora_path)
29
- test_dataset = load_from_disk("/gemini/code/FMD/final_dataset/Test")
30
- results = []
31
- with torch.no_grad():
32
- for data in tqdm(test_dataset):
33
- model_input = tokenizer(
34
- data['instruction_1'], # input text
35
- add_special_tokens=False, # do not add special tokens
36
- truncation=True, # enable truncation
37
- max_length=3000 # set the maximum length
38
- )
39
- model_input = tokenizer.decode(model_input["input_ids"], skip_special_tokens=False)  # back to text, now capped at 3000 tokens, for embedding in the prompt below
40
-
41
- model_inputs = tokenizer(f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an expert in financial misinformation detection.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{model_input}\nimage information: {data['image_info']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", truncation=True, max_length=3600, add_special_tokens=False,return_tensors="pt").to('cuda')  # NOTE(review): only tokenizer output (text) is passed to the model here; no pixel inputs - confirm this is intended
42
- # Generate the model output
43
- generated_ids = model.generate(**model_inputs, max_new_tokens=1024)
44
-
45
- # Strip the input tokens so that only the generated prediction remains
46
- generated_ids = [
47
- output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
48
- ]
49
-
50
- # Decode the generated prediction
51
- responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]  # one prompt per iteration, so keep the first decoded string
52
- print(responses)
53
- # Append each result to the list in order
54
- results.append({
55
- "ID": data['ID'],
56
- "response": responses
57
- })
58
- if results:
59
- df = pd.DataFrame(results)
60
- output_csv = "/gemini/code/FMD/inference/result_final_model_4/response.csv"
61
- df.to_csv(output_csv, index=False, encoding='utf-8')
62
- print(f"Results saved to {output_csv}")