In [None]:
%%writefile stage1.py
import os

# Pinned packages installed from the offline wheel directory.
# Change the --find-links path below to the location of your own uploaded wheels.
for _pinned_spec in (
    "polars==1.12.0",
    "transformers==4.42.4",
    "datasets==3.1.0",
    "sentence_transformers==3.2.1",
    "peft==0.11.1",
    "trl==0.8.6",
    "accelerate==1.1.1",
    "einops==0.8.0",
):
    # --no-index restricts pip to the local directory given by --find-links.
    os.system(f"pip install {_pinned_spec} --no-index --find-links=/kaggle/input/eedi-llmemb-lib")

# flash-attn is installed directly from a prebuilt wheel file.
os.system("pip install /kaggle/input/flash-attn-lib/flash_attn-2.5.8cu122torch2.3cxx11abiTRUE-cp310-cp310-linux_x86_64.whl")

import sys
sys.path.append("/kaggle/input/utilsfile/")


import os
import copy
from dataclasses import dataclass
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import gc
import ast
from tqdm import tqdm
import json
import shutil
import sys
from glob import glob

import polars as pl
import polars
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.special import softmax

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import Tensor

# from datasets import Dataset, DatasetDict, load_dataset
import transformers
import datasets
import sentence_transformers
from transformers import (
 BitsAndBytesConfig,
 AutoModelForCausalLM,
 AutoModel,
 AutoTokenizer,

 PreTrainedTokenizerFast,
 PreTrainedTokenizerBase, 
 Trainer,
 TrainingArguments,
 DataCollatorWithPadding,
 DataCollatorForSeq2Seq,
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from accelerate import Accelerator

from utils import seed_torch, current_date_time, get_timediff
from utils import load_yaml, simple_namespace, compare_yaml, format_diffs, write_to_summary_log, init_logger
from utils import zip_files_in_directory, upload_to_kaggle
from utils import mapk

# Setting for the Hugging Face tokenizers parallelism switch.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Input locations (relative to the working directory) and the output root.
base_dir = ".."
input_dir = base_dir + "/input"
comp_dir = input_dir + "/eedi-mining-misconceptions-in-mathematics"
output_dir = "/kaggle/working/"

# Retriever artefacts from the previous stage -- change both paths to your own model upload.
last_stage_dir = "/kaggle/input/12231510-retriever/12231510_retriever"
adapter_model_path = last_stage_dir + "/12231510adapetermodel"

# Use the first YAML file found in the stage directory as the run configuration.
yaml_path = glob(last_stage_dir + "/*.yaml")[0]
cfg = simple_namespace(load_yaml(yaml_path))

seed_torch(cfg.general.seed)

# Build a per-run output directory: drop "-", ":" and " " from the timestamp
# and keep characters 4..11 of what remains.
cur_time = current_date_time()
cur_time_abbr = cur_time.translate(str.maketrans("", "", "-: "))[4:12]
output_dir = output_dir + "/" + cur_time_abbr + "_infer"
os.makedirs(output_dir, exist_ok=True)
LOGGER = init_logger(output_dir + "/train.log")
# shutil.copy(yaml_path, yaml_path.replace(last_stage_dir, output_dir))

# Log the number of visible GPUs.
num_gpus = torch.cuda.device_count()
LOGGER.info(f"可用的 GPU 数量: {num_gpus}")

# Record the versions of the key libraries in the run log.
for _lib in (polars, torch, transformers, datasets, sentence_transformers):
    LOGGER.info(f"{_lib.__name__}=={_lib.__version__}")
LOGGER.info("")

# %% ================== Read data =======================
# NOTE: despite its name, train_df holds the competition *test* split (test.csv).
train_df = pl.read_csv(f"{comp_dir}/test.csv")
LOGGER.info(f"len(train_df): {len(train_df)}")

misconception_mapping_df = pl.read_csv(f"{comp_dir}/misconception_mapping.csv")
LOGGER.info(f"len(misconception_mapping_df): {len(misconception_mapping_df)}")
LOGGER.info(f"")

# Columns shared by every answer option of a question.
common_col = [
 "QuestionId",
 "ConstructName",
 "SubjectName",
 "QuestionText",
 "CorrectAnswer",
]

# Reshape the wide question table into a long table (one row per question/answer option)
# and add the derived columns used to build the retrieval query.
long_df = (
 train_df
 # Keep only common_col plus the four Answer[A-D]Text columns.
 .select(
 pl.col(common_col + [f"Answer{alpha}Text" for alpha in ["A", "B", "C", "D"]])
 )
 # Look up the text of the correct answer and store it in CorrectAnswerText
 # (null when CorrectAnswer is not one of A-D).
 .with_columns(
 pl.when(pl.col("CorrectAnswer") == "A").then(pl.col("AnswerAText"))
 .when(pl.col("CorrectAnswer") == "B").then(pl.col("AnswerBText"))
 .when(pl.col("CorrectAnswer") == "C").then(pl.col("AnswerCText"))
 .when(pl.col("CorrectAnswer") == "D").then(pl.col("AnswerDText"))
 .otherwise(None)
 .alias("CorrectAnswerText")
 )
 # Wide -> long: unpivot the Answer[A-D]Text columns.
 .unpivot(
 index=common_col+["CorrectAnswerText"], # columns carried over unchanged
 variable_name="AnswerType", # name of the source column (e.g. "AnswerAText")
 value_name="AnswerText", # text of that answer option
 )
 # Derived columns.
 .with_columns(
 # AllText: construct, subject, question, correct answer and this answer option,
 # each under its own "### ..." header.
 pl.concat_str(
 [
 '### Construct\n' + pl.col("ConstructName"),
 '\n### Subject\n' + pl.col("SubjectName"),
 '\n### Question\n'+ pl.col("QuestionText"),
 '\n### Correct Answer\n' + pl.col("CorrectAnswerText"),
 '\n### Wrong Answer\n' + pl.col("AnswerText"),
 ],
 separator="", # no separator: every piece already carries its own leading newline/header
 ).alias("AllText"),
 # AnswerAlphabet: the option letter (A-D) extracted from AnswerType.
 pl.col("AnswerType").str.extract(r"Answer([A-D])Text$").alias("AnswerAlphabet"),
 )
 # QuestionId_Answer: "<QuestionId>_<letter>", used as the row identifier in the outputs.
 .with_columns(
 pl.concat_str(
 [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
 ).alias("QuestionId_Answer"),
 )
 # Sort by QuestionId_Answer.
 .sort("QuestionId_Answer")
)

# Row count before the correct-answer rows are filtered out below.
LOGGER.info(f"long_df len has nan: {len(long_df)}")


long_df = long_df.to_pandas()
# Drop the rows whose option letter is the correct answer, keeping only wrong answers.
long_df = long_df[long_df["CorrectAnswer"] != long_df["AnswerAlphabet"]].reset_index(drop=True)
# Bare expression: a notebook would display it, but it has no visible effect when this file runs as a script.
long_df

def add_suffix(text, suffix_text, is_query):
    """Prepend ``suffix_text`` to ``text`` and normalise surrounding whitespace.

    Despite the name, ``suffix_text`` is placed *in front of* ``text``.
    The combined string is stripped on both ends; when ``is_query`` is truthy
    a single trailing newline is appended afterwards.

    Args:
        text: the body text.
        suffix_text: the text placed before ``text``.
        is_query: whether to terminate the result with ``"\\n"``.

    Returns:
        The formatted string.
    """
    combined = "{}{}".format(suffix_text, text).strip()
    return combined + "\n" if is_query else combined

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pick, for every sequence, the hidden state of its last attended token.

    If every row of ``attention_mask`` has a 1 in the final column, the batch
    is treated as left-padded and the final position is returned for all rows.
    Otherwise the position ``attention_mask.sum(dim=1) - 1`` is used per row.

    Args:
        last_hidden_states: indexed as ``[row, position, ...]``.
        attention_mask: indexed as ``[row, position]``; summed to count attended tokens.

    Returns:
        One hidden-state vector per row.
    """
    n_rows = attention_mask.shape[0]
    # All rows end on a real token -> the last position is valid for the whole batch.
    if attention_mask[:, -1].sum() == n_rows:
        return last_hidden_states[:, -1]

    last_positions = attention_mask.sum(dim=1) - 1
    row_ids = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_ids, last_positions]



# Base encoder: weights are loaded in bf16 when the training config used bf16 AMP, fp16 otherwise.
base_model = AutoModel.from_pretrained(
    "/kaggle/input/qwen2.5/transformers/3b-instruct/1",
    torch_dtype={"bf16": torch.bfloat16}.get(cfg.training.amp, torch.float16),
    device_map="auto",
    trust_remote_code=True,
    # quantization_config=bnb_config,
    local_files_only=True,
)

# Attach the fine-tuned adapter on top of the base model.
model = PeftModel.from_pretrained(base_model, adapter_model_path)

# model.eval()

# The tokenizer is loaded from the adapter directory.
tokenizer = AutoTokenizer.from_pretrained(adapter_model_path, trust_remote_code=True)

start_time = time.time()

model.eval()
batch_size = 4


def embed_texts(texts, prefix, is_query, max_length, batch_size):
    """Encode ``texts`` with the retriever model and return unit-norm embeddings.

    Each text is formatted with ``add_suffix(text, prefix, is_query=is_query)``,
    tokenized in batches (padded, truncated to ``max_length``), run through
    ``model`` under ``torch.no_grad()``, pooled with ``last_token_pool`` and
    L2-normalised along dim 1.

    Args:
        texts: list of raw strings to embed.
        prefix: string placed in front of every text.
        is_query: forwarded to ``add_suffix``.
        max_length: tokenizer truncation length.
        batch_size: number of texts per forward pass.

    Returns:
        ``(embeddings, last_batch)`` where ``embeddings`` is the concatenation
        of all batch embeddings along dim 0 (input order preserved) and
        ``last_batch`` is the list of formatted strings of the final batch.

    Raises:
        ValueError: if ``texts`` is empty.
    """
    if not texts:
        raise ValueError("embed_texts() received no texts to embed")

    pooled_batches = []
    last_batch = []
    for start in tqdm(range(0, len(texts), batch_size)):
        last_batch = [
            add_suffix(raw, prefix, is_query=is_query)
            for raw in texts[start:start + batch_size]
        ]
        encodings = tokenizer(
            last_batch,
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=max_length,
        )
        input_ids = encodings['input_ids'].to(model.device)
        attention_mask = encodings['attention_mask'].to(model.device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = last_token_pool(outputs.last_hidden_state, attention_mask)
        # One normalised vector per text in this batch.
        pooled_batches.append(F.normalize(pooled, p=2, dim=1))
        torch.cuda.empty_cache()
    return torch.cat(pooled_batches, dim=0), last_batch


# Show the first raw query text.
print(long_df["AllText"].to_list()[0])

query_list = long_df["AllText"].to_list()
query_embeddings, batch_query_list = embed_texts(
    query_list,
    cfg.data.query_prefix,
    is_query=True,
    max_length=cfg.model.query_max_length,
    batch_size=batch_size,
)

# Show one formatted query (first item of the last batch).
print(batch_query_list[0])


misconception_name = misconception_mapping_df["MisconceptionName"].to_list()
misconception_embeddings, batch_misconception_name = embed_texts(
    misconception_name,
    cfg.data.mis_prefix,
    is_query=False,
    max_length=cfg.model.mis_max_length,
    batch_size=batch_size,
)


# Show one formatted misconception (first item of the last batch).
print(batch_misconception_name[0])

# Similarity matrix: one row per query, one column per misconception, scaled by 100.
scores = ((query_embeddings @ misconception_embeddings.T) * 100).float().cpu().numpy()
LOGGER.info(f"{scores.shape = }")

# Rank misconception row indices by descending score and keep the top 25.
# NOTE(review): the row index is used directly as MisconceptionId -- this assumes
# misconception_mapping.csv is ordered so that row position == MisconceptionId; confirm.
preds_all_mm_ids = np.argsort(-scores, axis=1)
preds_top25_mm_ids = preds_all_mm_ids[:, :25]
long_df["MisconceptionId"] = [
    " ".join(map(str, ranked)) for ranked in preds_top25_mm_ids.tolist()
]

# test.pq feeds the reranking stage; submission2.csv is the retrieval-only submission.
long_df.to_parquet("test.pq")
long_df[["QuestionId_Answer", "MisconceptionId"]].to_csv("submission2.csv", index=False)

In [None]:
!python stage1.py

In [None]:
import os
import sys
sys.path.append("/kaggle/input/utilsfile/")


import os
import copy
from dataclasses import dataclass
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import gc
import ast
from tqdm import tqdm
import json
import shutil
import sys
from glob import glob

import polars as pl
import polars
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.special import softmax

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import Tensor

# from datasets import Dataset, DatasetDict, load_dataset
import transformers
import datasets
import sentence_transformers
from transformers import (
 BitsAndBytesConfig,
 AutoModelForCausalLM,
 AutoModel,
 AutoTokenizer,

 PreTrainedTokenizerFast,
 PreTrainedTokenizerBase, 
 Trainer,
 TrainingArguments,
 DataCollatorWithPadding,
 DataCollatorForSeq2Seq,
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from accelerate import Accelerator


from utils import seed_torch, current_date_time, get_timediff
from utils import load_yaml, simple_namespace, compare_yaml, format_diffs, write_to_summary_log, init_logger
from utils import zip_files_in_directory, upload_to_kaggle
from utils import mapk

# Setting for the Hugging Face tokenizers parallelism switch.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Input locations (relative to the working directory) and the output root.
base_dir = ".."
input_dir = base_dir + "/input"
comp_dir = input_dir + "/eedi-mining-misconceptions-in-mathematics"
output_dir = "/kaggle/working/"

# Reranker artefacts -- change both paths to your own model upload.
last_stage_dir = "/kaggle/input/000000-reranker/000000_reranker"
adapter_model_path = last_stage_dir + "/checkpoint-874"

# Use the first YAML file found in the stage directory as the run configuration.
yaml_path = glob(last_stage_dir + "/*.yaml")[0]
cfg = simple_namespace(load_yaml(yaml_path))

seed_torch(cfg.general.seed)

# Build a per-run output directory: drop "-", ":" and " " from the timestamp
# and keep characters 4..11 of what remains.
cur_time = current_date_time()
cur_time_abbr = cur_time.translate(str.maketrans("", "", "-: "))[4:12]
output_dir = output_dir + "/" + cur_time_abbr + "_infer_stage2"
os.makedirs(output_dir, exist_ok=True)
LOGGER = init_logger(output_dir + "/infer_stage2.log")
# shutil.copy(yaml_path, yaml_path.replace(last_stage_dir, output_dir))

# Log the number of visible GPUs.
num_gpus = torch.cuda.device_count()
LOGGER.info(f"可用的 GPU 数量: {num_gpus}")

# Record the versions of the key libraries in the run log.
for _lib in (polars, torch, transformers, datasets, sentence_transformers):
    LOGGER.info(f"{_lib.__name__}=={_lib.__version__}")
LOGGER.info("")

In [None]:
# Misconception catalogue: names as a list and an id -> name lookup.
misconception_mapping_df = pl.read_csv(f"{comp_dir}/misconception_mapping.csv")
misconception_name = misconception_mapping_df["MisconceptionName"].to_list()
misconception_dict = (
    misconception_mapping_df
    .to_pandas()
    .set_index('MisconceptionId')['MisconceptionName']
    .to_dict()
)
LOGGER.info(f"len(misconception_mapping_df): {len(misconception_mapping_df)}")
LOGGER.info("")

# Retrieval output written by stage1.py; MisconceptionId holds space-separated ranked ids.
test_df = pd.read_parquet("test.pq")[["QuestionId_Answer", "AllText", "MisconceptionId"]]
# Parse the ranked ids into a list of ints per row.
test_df["preds_all_mm_ids"] = test_df["MisconceptionId"].apply(
    lambda ids_str: [int(token) for token in ids_str.split(" ")]
)
def data_preprocess(df, is_train):
    """Turn each row's top-5 retrieved misconceptions into a multiple-choice prompt.

    Works on ``df`` in place (and also returns it). Adds the columns
    ``top_mm_ids`` (the first five entries of ``preds_all_mm_ids`` in a random
    order drawn from ``np.random``) and ``top_mm_texts`` (their names looked up
    in the module-level ``misconception_dict``), then extends ``AllText`` with
    the lettered candidate list and the closing question.

    Args:
        df: DataFrame with ``preds_all_mm_ids`` and ``AllText`` columns.
        is_train: accepted but not used by this function.

    Returns:
        The same DataFrame object, modified.
    """
    # First five retrieved candidates per row.
    df["top_mm_ids"] = df["preds_all_mm_ids"].apply(lambda ranked: ranked[:5])

    # Shuffle the candidate order.
    df["top_mm_ids"] = df["top_mm_ids"].apply(
        lambda candidates: np.random.permutation(candidates).tolist()
    )

    # Candidate ids -> misconception names, in the shuffled order.
    df["top_mm_texts"] = df["top_mm_ids"].apply(
        lambda candidates: [misconception_dict[mm_id] for mm_id in candidates]
    )

    # Header introducing the options.
    df["AllText"] = df["AllText"] + "\n\nHere are 5 possible candidates for misconception:\n"

    def _append_options(row):
        # Render "A. ...", "B. ...", ... (chr(65) == "A"), one option per line.
        option_lines = []
        for position, candidate in enumerate(row["top_mm_texts"]):
            option_lines.append(f"{chr(65+position)}. {candidate}")
        return row["AllText"] + "\n".join(option_lines)

    df["AllText"] = df.apply(_append_options, axis=1)

    # Closing question; the prompt ends on "Answer:".
    df["AllText"] = df["AllText"] + "\nWhich misconception candidate best explains what led to the wrong answer? (Please directly answer A, B, C, D or E)\nAnswer:"

    return df

# Build the multiple-choice prompt text for every test row.
# NOTE(review): is_train=True is passed on the inference path; data_preprocess does not read
# the flag, so it has no effect here.
test_df = data_preprocess(test_df, is_train=True)

In [None]:
print(f"adapter_model_path: {adapter_model_path}")

# Base causal LM: weights are loaded in bf16 when the training config used bf16 AMP, fp16 otherwise.
base_model = AutoModelForCausalLM.from_pretrained(
    "/kaggle/input/qwen2.5/transformers/3b-instruct/1",
    torch_dtype={"bf16": torch.bfloat16}.get(cfg.training.amp, torch.float16),
    device_map="auto",
    trust_remote_code=True,
    # quantization_config=bnb_config,
    local_files_only=True,
)

# Attach the fine-tuned adapter on top of the base model.
model = PeftModel.from_pretrained(base_model, adapter_model_path)

# The tokenizer is loaded from the adapter checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(adapter_model_path, trust_remote_code=True)

model.eval()

In [None]:
# Chat-style prompt with a system turn, a user turn filled from {AllText},
# and an opened assistant turn for the model to continue.
PROMPT = """<|im_start|>system
Given a math question and its incorrect answer, identify the underlying misconception that led to the mistake.<|im_end|>
<|im_start|>user
{AllText}<|im_end|>
<|im_start|>assistant
"""


def apply_template(row):
    """Return PROMPT with its ``{AllText}`` slot filled from ``row["AllText"]``.

    Args:
        row: any mapping-like object supporting ``row["AllText"]``.

    Returns:
        The formatted prompt string.
    """
    return PROMPT.format(AllText=row["AllText"])

# One chat-formatted prompt per row. apply_template already returns a plain string,
# so the row-wise apply yields a Series that can be assigned to the column directly
# (no per-row pd.Series wrapper needed).
test_df["instruction"] = test_df.apply(apply_template, axis=1)
# Token count of each prompt, logged to check against the max_length used at inference.
test_df["instruction_token_len"] = test_df["instruction"].apply(lambda x: len(tokenizer(x)["input_ids"]))
LOGGER.info(f"test instruction_token_len range: {test_df['instruction_token_len'].min()} ~ {test_df['instruction_token_len'].max()}")

In [None]:
# Sanity check: print the full chat-formatted prompt of the row labelled 0.
print(test_df["instruction"][0])

In [None]:
# Option letter -> position inside the (shuffled) top_mm_ids list.
l2i_dict = {"A":0, "B":1, "C":2, "D":3, "E":4}

# Vocabulary id of each bare option letter ('A'..'E'). This does not depend on
# the row, so it is computed once instead of five tokenizer calls per row.
option_token_ids = {letter: tokenizer(letter).input_ids[-1] for letter in l2i_dict}

# Reranked candidate lists, collected in row order and assigned after the loop.
# Rows yielded by iterrows() are copies, so results are written back explicitly
# rather than by mutating a list reached through the row.
reranked_ids = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    top_mm_ids = row['top_mm_ids']

    # NOTE(review): with truncation at 768 tokens an over-long prompt may lose its
    # trailing "Answer:" / assistant header, depending on the tokenizer's truncation
    # side -- confirm against the logged instruction_token_len range.
    inputs = tokenizer(
        row['instruction'],
        # padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=768,
    )

    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)

    # Position of the last attended token (last non-zero entry of the attention mask).
    last_one_index = torch.nonzero(inputs['attention_mask'])[-1][1].item()
    # Logits at that position, i.e. the scores for the next token the model would emit.
    next_token_logits = output.logits[0][last_one_index]

    # (logit, letter) pairs sorted from highest to lowest logit. Logits are converted
    # to Python floats so the sort compares plain numbers; ties fall back to the
    # letter, also in descending order.
    options_list = sorted(
        (
            (next_token_logits[token_id].item(), letter)
            for letter, token_id in option_token_ids.items()
        ),
        reverse=True,
    )

    # Reorder the five candidates by the model's preference and keep the
    # remaining retrieved ids (positions 5 onward) in their original order.
    sorted_ids = [top_mm_ids[l2i_dict[letter]] for _, letter in options_list]
    reranked_ids.append(sorted_ids + list(row['preds_all_mm_ids'][5:]))

test_df["preds_all_mm_ids"] = pd.Series(reranked_ids, index=test_df.index)

In [None]:
# Save submission.csv
# Serialise each ranked id list back into a space-separated string.
test_df["MisconceptionId"] = test_df["preds_all_mm_ids"].apply(
    lambda ranked: " ".join(str(mm_id) for mm_id in ranked)
)
test_df[["QuestionId_Answer", "MisconceptionId"]].to_csv("submission.csv", index=False)
test_df