Spaces:
Runtime error
Runtime error
Commit
·
d0d5660
1
Parent(s):
3776314
update scripts
Browse files- src/backend/model_operations.py +383 -28
src/backend/model_operations.py
CHANGED
|
@@ -35,7 +35,7 @@ import spacy_transformers
|
|
| 35 |
import subprocess
|
| 36 |
|
| 37 |
# Run the command to download the spaCy model
|
| 38 |
-
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
|
| 39 |
# subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
|
| 40 |
# subprocess.run(["pip", "install", "spacy-transformers"], check=True)
|
| 41 |
# subprocess.run(["pip", "install", "curated-transformers"], check=True)
|
|
@@ -43,7 +43,7 @@ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=Tr
|
|
| 43 |
# Load spacy model for word tokenization
|
| 44 |
# nlp = spacy.load("en_core_web_sm")
|
| 45 |
try:
|
| 46 |
-
nlp1 = spacy.load("
|
| 47 |
except OSError:
|
| 48 |
print("Can not load spacy model")
|
| 49 |
|
|
@@ -171,7 +171,8 @@ class ResponseGenerator:
|
|
| 171 |
# print(ID, q_ID, prompt_value)
|
| 172 |
system_prompt = envs.SYSTEM_PROMPT
|
| 173 |
_user_prompt = prompt_value
|
| 174 |
-
|
|
|
|
| 175 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
| 176 |
while True:
|
| 177 |
try:
|
|
@@ -179,6 +180,7 @@ class ResponseGenerator:
|
|
| 179 |
print(self.model_id.lower(),'-',ID,'-',j,'-',ii)
|
| 180 |
|
| 181 |
_response = self.send_request(system_prompt, _user_prompt)
|
|
|
|
| 182 |
# print(f"Finish index {index}")
|
| 183 |
break
|
| 184 |
except Exception as e:
|
|
@@ -205,6 +207,7 @@ class ResponseGenerator:
|
|
| 205 |
time.sleep(wait_time)
|
| 206 |
try:
|
| 207 |
_response = self.send_request(system_prompt, _user_prompt)
|
|
|
|
| 208 |
break
|
| 209 |
except Exception as ee:
|
| 210 |
exceptions.append(ee)
|
|
@@ -512,7 +515,7 @@ class EvaluationModel:
|
|
| 512 |
self.scores = []
|
| 513 |
self.humanlike_score = None
|
| 514 |
|
| 515 |
-
def
|
| 516 |
'''code results from LLM's response'''
|
| 517 |
output = []
|
| 518 |
'''database for Exp4'''
|
|
@@ -738,25 +741,365 @@ class EvaluationModel:
|
|
| 738 |
doc = nlp1(sentence)
|
| 739 |
subject = "None"
|
| 740 |
obj = "None"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 741 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
|
| 743 |
for token in doc:
|
| 744 |
if token.dep_ == "nsubj":
|
| 745 |
subject = token.text
|
| 746 |
elif token.dep_ == "dobj":
|
| 747 |
obj = token.text
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
|
|
|
|
|
|
| 751 |
output.append("Other")
|
| 752 |
-
elif subject in
|
| 753 |
-
#print(rs, subject, obj, "VP")
|
| 754 |
output.append("VP")
|
| 755 |
-
elif obj in
|
| 756 |
-
#print(rs, subject, obj, "NP")
|
| 757 |
output.append("NP")
|
| 758 |
else:
|
| 759 |
-
#print(rs, subject, obj, "Other")
|
| 760 |
output.append("Other")
|
| 761 |
|
| 762 |
'''Exp7'''
|
|
@@ -834,11 +1177,22 @@ class EvaluationModel:
|
|
| 834 |
# exit()
|
| 835 |
'''LLM'''
|
| 836 |
print(len(output))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 837 |
self.data = pd.DataFrame(list(
|
| 838 |
-
zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],
|
| 839 |
-
|
| 840 |
-
columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Stimuli 1",
|
| 841 |
-
"Coding"])
|
| 842 |
|
| 843 |
return self.data
|
| 844 |
|
|
@@ -848,6 +1202,8 @@ class EvaluationModel:
|
|
| 848 |
|
| 849 |
|
| 850 |
|
|
|
|
|
|
|
| 851 |
def calculate_js_divergence(self, file_path_1, file_path_2):
|
| 852 |
"""
|
| 853 |
Calculate the Jensen-Shannon divergence for response distributions between two datasets.
|
|
@@ -855,7 +1211,7 @@ class EvaluationModel:
|
|
| 855 |
removes the original E5 and E51, and then calculates the JS divergence between the datasets.
|
| 856 |
|
| 857 |
Parameters:
|
| 858 |
-
file_path_1 (str): Path to the first dataset file (
|
| 859 |
file_path_2 (str): Path to the second dataset file (CSV format).
|
| 860 |
|
| 861 |
Returns:
|
|
@@ -893,17 +1249,15 @@ class EvaluationModel:
|
|
| 893 |
human_df = pd.concat([human_df, human_e5], ignore_index=True)
|
| 894 |
llm_df = pd.concat([llm_df, llm_e5], ignore_index=True)
|
| 895 |
|
| 896 |
-
|
| 897 |
### Calculate Average JS Divergence ###
|
| 898 |
|
| 899 |
-
|
| 900 |
# Extract the relevant columns for JS divergence calculation
|
| 901 |
human_responses = human_df[['Question_ID', 'Coding']]
|
| 902 |
llm_responses = llm_df[['Question_ID', 'Coding']]
|
| 903 |
|
| 904 |
# Remove 'Other' responses
|
| 905 |
-
human_responses = human_responses[human_responses['Coding'] != 'Other']
|
| 906 |
-
llm_responses = llm_responses[llm_responses['Coding'] != 'Other']
|
| 907 |
|
| 908 |
# Get unique Question_IDs present in both datasets
|
| 909 |
common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
|
|
@@ -933,6 +1287,7 @@ class EvaluationModel:
|
|
| 933 |
|
| 934 |
# Calculate the average JS divergence per experiment and the confidence interval
|
| 935 |
results = {}
|
|
|
|
| 936 |
for exp, divs in js_divergence.items():
|
| 937 |
avg_js_divergence = 1 - np.nanmean(divs)
|
| 938 |
ci_lower, ci_upper = bootstrap((divs,), np.nanmean, confidence_level=0.95,
|
|
@@ -941,14 +1296,14 @@ class EvaluationModel:
|
|
| 941 |
'average_js_divergence': avg_js_divergence,
|
| 942 |
'confidence_interval': (1 - ci_upper, 1 - ci_lower) # Adjust for 1 - score
|
| 943 |
}
|
|
|
|
| 944 |
|
| 945 |
-
# Calculate the
|
| 946 |
-
|
| 947 |
-
flattened_js_divergence = np.concatenate([np.array(divs) for divs in js_divergence.values()])
|
| 948 |
|
| 949 |
-
#
|
| 950 |
overall_ci_lower, overall_ci_upper = bootstrap(
|
| 951 |
-
(
|
| 952 |
np.nanmean,
|
| 953 |
confidence_level=0.95,
|
| 954 |
n_resamples=1000
|
|
@@ -957,8 +1312,8 @@ class EvaluationModel:
|
|
| 957 |
# Combine all results into one dictionary
|
| 958 |
all_results = {
|
| 959 |
'overall': {
|
| 960 |
-
'average_js_divergence':
|
| 961 |
-
'confidence_interval': (
|
| 962 |
},
|
| 963 |
'per_experiment': results
|
| 964 |
}
|
|
|
|
| 35 |
import subprocess
|
| 36 |
|
| 37 |
# Run the command to download the spaCy model
|
| 38 |
+
# subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
|
| 39 |
# subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
|
| 40 |
# subprocess.run(["pip", "install", "spacy-transformers"], check=True)
|
| 41 |
# subprocess.run(["pip", "install", "curated-transformers"], check=True)
|
|
|
|
| 43 |
# Load spacy model for word tokenization
|
| 44 |
# nlp = spacy.load("en_core_web_sm")
|
| 45 |
try:
|
| 46 |
+
nlp1 = spacy.load("en_core_web_sm")
|
| 47 |
except OSError:
|
| 48 |
print("Can not load spacy model")
|
| 49 |
|
|
|
|
| 171 |
# print(ID, q_ID, prompt_value)
|
| 172 |
system_prompt = envs.SYSTEM_PROMPT
|
| 173 |
_user_prompt = prompt_value
|
| 174 |
+
print(_user_prompt)
|
| 175 |
+
for ii in range(100):
|
| 176 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
| 177 |
while True:
|
| 178 |
try:
|
|
|
|
| 180 |
print(self.model_id.lower(),'-',ID,'-',j,'-',ii)
|
| 181 |
|
| 182 |
_response = self.send_request(system_prompt, _user_prompt)
|
| 183 |
+
# print(_response)
|
| 184 |
# print(f"Finish index {index}")
|
| 185 |
break
|
| 186 |
except Exception as e:
|
|
|
|
| 207 |
time.sleep(wait_time)
|
| 208 |
try:
|
| 209 |
_response = self.send_request(system_prompt, _user_prompt)
|
| 210 |
+
|
| 211 |
break
|
| 212 |
except Exception as ee:
|
| 213 |
exceptions.append(ee)
|
|
|
|
| 515 |
self.scores = []
|
| 516 |
self.humanlike_score = None
|
| 517 |
|
| 518 |
+
def code_results_llm_cleaned(self, responses_df):
|
| 519 |
'''code results from LLM's response'''
|
| 520 |
output = []
|
| 521 |
'''database for Exp4'''
|
|
|
|
| 741 |
doc = nlp1(sentence)
|
| 742 |
subject = "None"
|
| 743 |
obj = "None"
|
| 744 |
+
pobj_list = [] # To collect all prepositional objects
|
| 745 |
+
|
| 746 |
+
for token in doc:
|
| 747 |
+
if token.dep_ == "nsubj":
|
| 748 |
+
subject = token.text
|
| 749 |
+
elif token.dep_ == "dobj":
|
| 750 |
+
obj = token.text
|
| 751 |
+
elif token.dep_ == "pobj":
|
| 752 |
+
pobj_list.append(token.text) # Collect prepositional objects
|
| 753 |
+
|
| 754 |
+
rs_list = rs.lower().split()
|
| 755 |
+
if subject in rs_list and (obj in rs_list or any(pobj == r for pobj in pobj_list for r in rs_list)):
|
| 756 |
+
output.append("Other")
|
| 757 |
+
elif subject in rs_list:
|
| 758 |
+
output.append("VP")
|
| 759 |
+
elif obj in rs_list or any(pobj == r for pobj in pobj_list for r in rs_list):
|
| 760 |
+
output.append("NP")
|
| 761 |
+
else:
|
| 762 |
+
output.append("Other")
|
| 763 |
+
|
| 764 |
+
'''Exp7'''
|
| 765 |
+
elif responses_df["Experiment"][i] == "E7":
|
| 766 |
+
# rs = responses_df["Response"][i].strip().lower()
|
| 767 |
+
rs = rs.replace(".", "").replace(",", "").lower()
|
| 768 |
+
#print("E7", rs)
|
| 769 |
+
if "yes" in rs and "no" in rs:
|
| 770 |
+
output.append("Other")
|
| 771 |
+
elif "no" in rs:
|
| 772 |
+
output.append("0")
|
| 773 |
+
elif "yes" in rs:
|
| 774 |
+
output.append("1")
|
| 775 |
+
else:
|
| 776 |
+
output.append("Other")
|
| 777 |
+
|
| 778 |
+
'''Exp8'''
|
| 779 |
+
elif responses_df["Experiment"][i] == "E8":
|
| 780 |
+
# rs = responses_df["Response"][i].strip()
|
| 781 |
+
#print("E8", rs)
|
| 782 |
+
if "something is wrong with the question" in rs:
|
| 783 |
+
output.append("1")
|
| 784 |
+
else:
|
| 785 |
+
output.append("0")
|
| 786 |
+
|
| 787 |
+
'''Exp9'''
|
| 788 |
+
elif responses_df["Experiment"][i] == "E9":
|
| 789 |
+
male, female = 0, 0
|
| 790 |
+
|
| 791 |
+
# rs = responses_df["Response"][i].strip()
|
| 792 |
+
if "because" in rs:
|
| 793 |
+
rs = rs.replace("because because", "because").split("because")[1]
|
| 794 |
+
else:
|
| 795 |
+
rs = rs
|
| 796 |
+
condition = responses_df["Factor 2"][i].strip()
|
| 797 |
+
rs = rs.split(" ")
|
| 798 |
+
for w in rs:
|
| 799 |
+
if w in male_keyword and female != 1:
|
| 800 |
+
male = 1
|
| 801 |
+
break
|
| 802 |
+
if w in female_keyword and male != 1:
|
| 803 |
+
female = 1
|
| 804 |
+
break
|
| 805 |
+
#print("E9", "condition", condition, "male", male, "female", female)
|
| 806 |
+
if male == 0 and female == 0:
|
| 807 |
+
output.append('Other')
|
| 808 |
+
else:
|
| 809 |
+
if male == 1 and female == 0:
|
| 810 |
+
if condition == "MF":
|
| 811 |
+
output.append("Subject")
|
| 812 |
+
elif condition == "FM":
|
| 813 |
+
output.append("Object")
|
| 814 |
+
else:
|
| 815 |
+
output.append("Other")
|
| 816 |
+
elif female == 1 and male == 0:
|
| 817 |
+
if condition == "MF":
|
| 818 |
+
output.append("Object")
|
| 819 |
+
elif condition == "FM":
|
| 820 |
+
output.append("Subject")
|
| 821 |
+
else:
|
| 822 |
+
output.append("Other")
|
| 823 |
+
|
| 824 |
+
'''Exp10'''
|
| 825 |
+
elif responses_df["Experiment"][i] == "E10":
|
| 826 |
+
# rs = responses_df["Response"][i].strip()
|
| 827 |
+
rs = rs.replace(".", "")
|
| 828 |
+
if rs == "yes":
|
| 829 |
+
output.append("1")
|
| 830 |
+
else:
|
| 831 |
+
output.append("0")
|
| 832 |
+
else:
|
| 833 |
+
#print("can;t find the Exp:", responses_df["Experiment"][i])
|
| 834 |
+
output.append("NA")
|
| 835 |
+
# print(output)
|
| 836 |
+
# exit()
|
| 837 |
+
'''LLM'''
|
| 838 |
+
print(len(output))
|
| 839 |
+
import re
|
| 840 |
+
def clean_text(text):
|
| 841 |
+
if isinstance(text, str):
|
| 842 |
+
return re.sub(r'[^\x00-\x7F]+', '', text)
|
| 843 |
+
return text
|
| 844 |
+
|
| 845 |
+
responses_df["Experiment"] = responses_df["Experiment"].apply(clean_text)
|
| 846 |
+
responses_df["Question_ID"] = responses_df["Question_ID"].apply(clean_text)
|
| 847 |
+
responses_df["Item"] = responses_df["Item"].apply(clean_text)
|
| 848 |
+
responses_df["Response"] = responses_df["Response"].apply(clean_text)
|
| 849 |
+
|
| 850 |
+
output = [str(item) for item in output]
|
| 851 |
+
|
| 852 |
+
self.data = pd.DataFrame(list(
|
| 853 |
+
zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],output)),
|
| 854 |
+
columns=["Experiment", "Question_ID", "Item", "Response","Coding"])
|
| 855 |
+
|
| 856 |
+
return self.data
|
| 857 |
+
|
| 858 |
+
def code_results_llm(self, responses_df):
|
| 859 |
+
'''code results from LLM's response'''
|
| 860 |
+
output = []
|
| 861 |
+
'''database for Exp4'''
|
| 862 |
+
item4 = pd.read_csv(envs.ITEM_4_DATA)
|
| 863 |
+
wordpair2code = {}
|
| 864 |
+
for j in range(len(item4['Coding'])):
|
| 865 |
+
wordpair2code[item4['Pair'][j]] = item4['Coding'][j]
|
| 866 |
+
'''verb for Exp5'''
|
| 867 |
+
item5 = pd.read_csv(envs.ITEM_5_DATA)
|
| 868 |
+
# item corresponding to verb, same item id corresponding to verb pair
|
| 869 |
+
item2verb2 = {}
|
| 870 |
+
item2verb1 = {}
|
| 871 |
+
|
| 872 |
+
Stimuli1, Stimuli2 = {}, {}
|
| 873 |
+
for j in range(len(item5['Item'])):
|
| 874 |
+
item2verb1[item5['Item'][j]] = item5['Verb1'][j]
|
| 875 |
+
item2verb2[item5['Item'][j]] = item5['Verb2'][j]
|
| 876 |
+
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
| 877 |
+
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
| 878 |
+
|
| 879 |
+
male_keyword = ["he", "his", "himself"]
|
| 880 |
+
female_keyword = ["she", "her", "herself"]
|
| 881 |
+
#print(len(responses_df["Experiment"]))
|
| 882 |
+
for i in range(len(responses_df["Experiment"])):
|
| 883 |
+
|
| 884 |
+
|
| 885 |
+
print(i, "/", len(responses_df["Experiment"]))
|
| 886 |
+
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
| 887 |
+
# print()
|
| 888 |
+
if pd.isna(responses_df["Response"][i]):
|
| 889 |
+
output.append("Other")
|
| 890 |
+
continue
|
| 891 |
+
rs = responses_df["Response"][i].strip().lower()
|
| 892 |
+
print(rs)
|
| 893 |
+
rs = rs.replace('"', '').replace(" ", " ").replace('.', '')
|
| 894 |
+
#lines = rs.split("\n")
|
| 895 |
+
#filtered_lines = [line for line in lines if line and not (line.endswith(":") or line.endswith(":"))]
|
| 896 |
+
# filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for
|
| 897 |
+
# r in filtered_lines]
|
| 898 |
+
# rs = "\n".join(filtered_lines)
|
| 899 |
+
# rs = rs.strip()
|
| 900 |
+
'''Exp1'''
|
| 901 |
+
if responses_df["Experiment"][i] == "E1":
|
| 902 |
+
rs_lower = rs.lower()
|
| 903 |
+
if "round" in rs_lower and "spiky" in rs_lower:
|
| 904 |
+
output.append("Other")
|
| 905 |
+
elif "round" in rs_lower:
|
| 906 |
+
output.append("Round")
|
| 907 |
+
elif "spiky" in rs_lower:
|
| 908 |
+
output.append("Spiky")
|
| 909 |
+
else:
|
| 910 |
+
output.append("Other")
|
| 911 |
+
|
| 912 |
+
'''Exp2'''
|
| 913 |
+
|
| 914 |
+
elif responses_df["Experiment"][i] == "E2":
|
| 915 |
+
# rs = responses_df["Response"][i].strip()
|
| 916 |
+
rs = rs.split(' ')
|
| 917 |
+
#print("E2", rs)
|
| 918 |
+
male, female = 0, 0
|
| 919 |
+
for word in rs:
|
| 920 |
+
if word in female_keyword and male == 0:
|
| 921 |
+
female = 1
|
| 922 |
+
output.append("Female")
|
| 923 |
+
break
|
| 924 |
+
if word in male_keyword and female == 0:
|
| 925 |
+
male = 1
|
| 926 |
+
output.append("Male")
|
| 927 |
+
break
|
| 928 |
+
if male == 0 and female == 0:
|
| 929 |
+
output.append("Other")
|
| 930 |
|
| 931 |
+
'''Exp3'''
|
| 932 |
+
elif responses_df["Experiment"][i] == "E3":
|
| 933 |
+
# rs = responses_df["Response"][i].strip()
|
| 934 |
+
#print("E3", rs)
|
| 935 |
+
pair = responses_df["Factor 2"][i]
|
| 936 |
+
word1, word2 = pair.replace(".", "").split('_')
|
| 937 |
+
|
| 938 |
+
if responses_df["Item"][i] == 12:
|
| 939 |
+
output.append("Other")
|
| 940 |
+
else:
|
| 941 |
+
words = rs.split() # split the response into words
|
| 942 |
+
if any(word == word1 for word in words) and any(word == word2 for word in words):
|
| 943 |
+
output.append("Other")
|
| 944 |
+
else:
|
| 945 |
+
if any(word.lower() == word1.lower() for word in words):
|
| 946 |
+
if len(word1) > len(word2):
|
| 947 |
+
output.append("Long")
|
| 948 |
+
else:
|
| 949 |
+
output.append("Short")
|
| 950 |
+
elif any(word.lower() == word2.lower() for word in words):
|
| 951 |
+
if len(word1) > len(word2):
|
| 952 |
+
output.append("Short")
|
| 953 |
+
else:
|
| 954 |
+
output.append("Long")
|
| 955 |
+
else:
|
| 956 |
+
if len(words) > 1:
|
| 957 |
+
# join the words using " "
|
| 958 |
+
word = " ".join(words)
|
| 959 |
+
if word.lower() == word1.lower():
|
| 960 |
+
if len(word1) > len(word2):
|
| 961 |
+
output.append("Long")
|
| 962 |
+
else:
|
| 963 |
+
output.append("Short")
|
| 964 |
+
elif word.lower() == word2.lower():
|
| 965 |
+
if len(word1) > len(word2):
|
| 966 |
+
output.append("Short")
|
| 967 |
+
else:
|
| 968 |
+
output.append("Long")
|
| 969 |
+
else:
|
| 970 |
+
output.append("Other")
|
| 971 |
+
else:
|
| 972 |
+
output.append("Other")
|
| 973 |
+
|
| 974 |
+
|
| 975 |
+
'''Exp4'''
|
| 976 |
+
|
| 977 |
+
elif responses_df["Experiment"][i] == "E4":
|
| 978 |
+
lines = rs.split("\n")
|
| 979 |
+
filtered_lines = []
|
| 980 |
+
if len(lines) > 1:
|
| 981 |
+
for r in lines[1:]:
|
| 982 |
+
if ':' in r:
|
| 983 |
+
filtered_lines.append(r.split(':', 1)[-1].strip())
|
| 984 |
+
else:
|
| 985 |
+
filtered_lines.append(r)
|
| 986 |
+
filtered_lines.insert(0, lines[0])
|
| 987 |
+
else:
|
| 988 |
+
filtered_lines = lines
|
| 989 |
+
# print(filtered_lines)
|
| 990 |
+
|
| 991 |
+
#filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
| 992 |
+
#rs = "\n".join(filtered_lines)
|
| 993 |
+
|
| 994 |
+
#filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for r in rs.split(";")]
|
| 995 |
+
#filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
| 996 |
+
rs = ";".join(filtered_lines).strip()
|
| 997 |
+
try:
|
| 998 |
+
meaning_word = rs.split(";")[4].replace(" ", '')
|
| 999 |
+
except IndexError:
|
| 1000 |
+
try:
|
| 1001 |
+
meaning_word = rs.split("\n")[4].replace(" ", '')
|
| 1002 |
+
except IndexError:
|
| 1003 |
+
output.append("Other")
|
| 1004 |
+
continue
|
| 1005 |
+
except Exception as e:
|
| 1006 |
+
print(f"Unexpected error: {e}")
|
| 1007 |
+
output.append("Other")
|
| 1008 |
+
continue
|
| 1009 |
+
|
| 1010 |
+
target = responses_df["Factor 2"][i].strip().lower()
|
| 1011 |
+
pair = target + "_" + meaning_word
|
| 1012 |
+
#print("E4:", pair)
|
| 1013 |
+
|
| 1014 |
+
if pair in wordpair2code.keys():
|
| 1015 |
+
output.append(wordpair2code[pair])
|
| 1016 |
+
else:
|
| 1017 |
+
output.append("Other")
|
| 1018 |
+
|
| 1019 |
+
'''Exp5'''
|
| 1020 |
+
elif responses_df["Experiment"][i] == "E5" or responses_df["Experiment"][i] == "E51":
|
| 1021 |
+
# sentence = responses_df["Response"][i].strip()
|
| 1022 |
+
item_id = responses_df["Item"][i]
|
| 1023 |
+
question_id = responses_df["Question_ID"][i]
|
| 1024 |
+
|
| 1025 |
+
if responses_df["Experiment"][i] == "E51":
|
| 1026 |
+
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
| 1027 |
+
#sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
| 1028 |
+
verb = item2verb1[item_id].lower()
|
| 1029 |
+
|
| 1030 |
+
sentence = sti1 + " " + rs.replace(sti1, "")
|
| 1031 |
+
#print("E5", verb, sentence)
|
| 1032 |
+
if responses_df["Experiment"][i] == "E5":
|
| 1033 |
+
#sti1 = Stimuli1[question_id].lower().replace("...", "")
|
| 1034 |
+
# print(sti1)
|
| 1035 |
+
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
| 1036 |
+
|
| 1037 |
+
verb = item2verb2[item_id].lower()
|
| 1038 |
+
sentence = sti2 + " " + rs.replace(sti2, "")
|
| 1039 |
+
#print("E5", verb, sentence)
|
| 1040 |
+
|
| 1041 |
+
doc = nlp1(sentence.replace(" ", " "))
|
| 1042 |
+
# print(doc)
|
| 1043 |
+
# print()
|
| 1044 |
+
verb_token = None
|
| 1045 |
+
for token in doc:
|
| 1046 |
+
# print(token.lemma_)
|
| 1047 |
+
if token.lemma_ == verb:
|
| 1048 |
+
verb_token = token
|
| 1049 |
+
break
|
| 1050 |
+
# exit()
|
| 1051 |
+
pobj, dative = None, None
|
| 1052 |
+
# print(verb_token.children)
|
| 1053 |
+
# exit()
|
| 1054 |
+
if verb_token is not None:
|
| 1055 |
+
for child in verb_token.children:
|
| 1056 |
+
# print(child)
|
| 1057 |
+
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (
|
| 1058 |
+
child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
|
| 1059 |
+
pobj = child.text
|
| 1060 |
+
if child.dep_ == 'dative':
|
| 1061 |
+
dative = child.text
|
| 1062 |
+
|
| 1063 |
+
# print("E5", pobj, dative)
|
| 1064 |
+
# exit()
|
| 1065 |
+
|
| 1066 |
+
if pobj:
|
| 1067 |
+
output.append("PO")
|
| 1068 |
+
elif dative:
|
| 1069 |
+
output.append("DO")
|
| 1070 |
+
else:
|
| 1071 |
+
# print("Other", sentence, pobj, dative)
|
| 1072 |
+
# exit()
|
| 1073 |
+
output.append("Other")
|
| 1074 |
+
|
| 1075 |
+
|
| 1076 |
+
|
| 1077 |
+
'''Exp6'''
|
| 1078 |
+
|
| 1079 |
+
elif responses_df["Experiment"][i] == "E6":
|
| 1080 |
+
sentence = responses_df["Stimuli 1"][i].strip().lower()
|
| 1081 |
+
#print("E6", sentence)
|
| 1082 |
+
doc = nlp1(sentence)
|
| 1083 |
+
subject = "None"
|
| 1084 |
+
obj = "None"
|
| 1085 |
+
pobj_list = [] # To collect all prepositional objects
|
| 1086 |
|
| 1087 |
for token in doc:
|
| 1088 |
if token.dep_ == "nsubj":
|
| 1089 |
subject = token.text
|
| 1090 |
elif token.dep_ == "dobj":
|
| 1091 |
obj = token.text
|
| 1092 |
+
elif token.dep_ == "pobj":
|
| 1093 |
+
pobj_list.append(token.text) # Collect prepositional objects
|
| 1094 |
+
|
| 1095 |
+
rs_list = rs.lower().split()
|
| 1096 |
+
if subject in rs_list and (obj in rs_list or any(pobj == r for pobj in pobj_list for r in rs_list)):
|
| 1097 |
output.append("Other")
|
| 1098 |
+
elif subject in rs_list:
|
|
|
|
| 1099 |
output.append("VP")
|
| 1100 |
+
elif obj in rs_list or any(pobj == r for pobj in pobj_list for r in rs_list):
|
|
|
|
| 1101 |
output.append("NP")
|
| 1102 |
else:
|
|
|
|
| 1103 |
output.append("Other")
|
| 1104 |
|
| 1105 |
'''Exp7'''
|
|
|
|
| 1177 |
# exit()
|
| 1178 |
'''LLM'''
|
| 1179 |
print(len(output))
|
| 1180 |
+
import re
|
| 1181 |
+
def clean_text(text):
|
| 1182 |
+
if isinstance(text, str):
|
| 1183 |
+
return re.sub(r'[^\x00-\x7F]+', '', text)
|
| 1184 |
+
return text
|
| 1185 |
+
|
| 1186 |
+
responses_df["Experiment"] = responses_df["Experiment"].apply(clean_text)
|
| 1187 |
+
responses_df["Question_ID"] = responses_df["Question_ID"].apply(clean_text)
|
| 1188 |
+
responses_df["Item"] = responses_df["Item"].apply(clean_text)
|
| 1189 |
+
responses_df["Response"] = responses_df["Response"].apply(clean_text)
|
| 1190 |
+
|
| 1191 |
+
output = [str(item) for item in output]
|
| 1192 |
+
|
| 1193 |
self.data = pd.DataFrame(list(
|
| 1194 |
+
zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],output)),
|
| 1195 |
+
columns=["Experiment", "Question_ID", "Item", "Response","Coding"])
|
|
|
|
|
|
|
| 1196 |
|
| 1197 |
return self.data
|
| 1198 |
|
|
|
|
| 1202 |
|
| 1203 |
|
| 1204 |
|
| 1205 |
+
|
| 1206 |
+
|
| 1207 |
def calculate_js_divergence(self, file_path_1, file_path_2):
|
| 1208 |
"""
|
| 1209 |
Calculate the Jensen-Shannon divergence for response distributions between two datasets.
|
|
|
|
| 1211 |
removes the original E5 and E51, and then calculates the JS divergence between the datasets.
|
| 1212 |
|
| 1213 |
Parameters:
|
| 1214 |
+
file_path_1 (str): Path to the first dataset file (CSV format).
|
| 1215 |
file_path_2 (str): Path to the second dataset file (CSV format).
|
| 1216 |
|
| 1217 |
Returns:
|
|
|
|
| 1249 |
human_df = pd.concat([human_df, human_e5], ignore_index=True)
|
| 1250 |
llm_df = pd.concat([llm_df, llm_e5], ignore_index=True)
|
| 1251 |
|
|
|
|
| 1252 |
### Calculate Average JS Divergence ###
|
| 1253 |
|
|
|
|
| 1254 |
# Extract the relevant columns for JS divergence calculation
|
| 1255 |
human_responses = human_df[['Question_ID', 'Coding']]
|
| 1256 |
llm_responses = llm_df[['Question_ID', 'Coding']]
|
| 1257 |
|
| 1258 |
# 'Other' responses are kept: the filter that removed them is disabled below
|
| 1259 |
+
#human_responses = human_responses[human_responses['Coding'] != 'Other']
|
| 1260 |
+
#llm_responses = llm_responses[llm_responses['Coding'] != 'Other']
|
| 1261 |
|
| 1262 |
# Get unique Question_IDs present in both datasets
|
| 1263 |
common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
|
|
|
|
| 1287 |
|
| 1288 |
# Calculate the average JS divergence per experiment and the confidence interval
|
| 1289 |
results = {}
|
| 1290 |
+
experiment_averages = []
|
| 1291 |
for exp, divs in js_divergence.items():
|
| 1292 |
avg_js_divergence = 1 - np.nanmean(divs)
|
| 1293 |
ci_lower, ci_upper = bootstrap((divs,), np.nanmean, confidence_level=0.95,
|
|
|
|
| 1296 |
'average_js_divergence': avg_js_divergence,
|
| 1297 |
'confidence_interval': (1 - ci_upper, 1 - ci_lower) # Adjust for 1 - score
|
| 1298 |
}
|
| 1299 |
+
experiment_averages.append(avg_js_divergence)
|
| 1300 |
|
| 1301 |
+
# Calculate the unweighted (simple) mean of the per-experiment scores (each is 1 - mean JS divergence)
|
| 1302 |
+
weighted_js_divergence = np.mean(experiment_averages) # Simple average over experiments
|
|
|
|
| 1303 |
|
| 1304 |
+
# Calculate the confidence interval for the overall JS divergence using bootstrap
|
| 1305 |
overall_ci_lower, overall_ci_upper = bootstrap(
|
| 1306 |
+
(experiment_averages,),
|
| 1307 |
np.nanmean,
|
| 1308 |
confidence_level=0.95,
|
| 1309 |
n_resamples=1000
|
|
|
|
| 1312 |
# Combine all results into one dictionary
|
| 1313 |
all_results = {
|
| 1314 |
'overall': {
|
| 1315 |
+
'average_js_divergence': weighted_js_divergence,
|
| 1316 |
+
'confidence_interval': (overall_ci_lower, overall_ci_upper)
|
| 1317 |
},
|
| 1318 |
'per_experiment': results
|
| 1319 |
}
|