giovanni-bonetta committed on
Commit
138d0d5
·
1 Parent(s): ff62d04

update gen and mc files, and app.py

Browse files
Files changed (3) hide show
  1. app.py +117 -15
  2. static/gen.csv +0 -0
  3. static/mc.csv +0 -0
app.py CHANGED
@@ -1,21 +1,123 @@
1
  import streamlit as st
2
- from datasets import load_dataset
3
- import tempfile
 
4
 
5
- # Load the dataset in streaming mode
6
- dataset = load_dataset("giobin/MAIA_2400", streaming=True)
7
 
8
- # Get an iterator over the dataset (assuming it's not too large)
9
- dataset_iter = iter(dataset["train"]) # Adjust split name if needed
10
 
11
- # Fetch the first video example
12
- video_example = next(dataset_iter)
13
- video_reader = video_example["video"] # Decord VideoReader object
14
 
15
- # Save the video temporarily
16
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_file:
17
- tmp_file.write(video_reader[:].asnumpy().tobytes()) # Convert frames to bytes
18
- video_path = tmp_file.name
 
 
 
 
 
 
19
 
20
- # Display the video
21
- st.video(video_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ from huggingface_hub import HfApi, hf_hub_download
5
 
6
+ HF_REPO = "giobin/MAIA_human_assessment_annotations"
7
+ CSV_FILENAME = "user_selections.csv"
8
 
9
def assign_samples(csv_path, samples_per_group=3):
    """Build the annotator -> samples mapping from the pool CSV.

    Rows whose ``question_category`` ends with ``"_B"`` are excluded. From the
    remaining rows, the first ``samples_per_group`` rows of each ``pool_pos``
    value (1, 2 and 3) form one group, and every annotator of a group
    receives the same DataFrame.

    Args:
        csv_path: Path or file-like object accepted by ``pandas.read_csv``;
            it must provide ``pool_pos`` and ``question_category`` columns.
        samples_per_group: Maximum number of rows kept per pool position.

    Returns:
        dict mapping each annotator name to that annotator's DataFrame.
    """
    df = pd.read_csv(csv_path)

    # na=False: a row with a missing category is not a "_B" row. Without it
    # the mask contains NaN and cannot be negated with ``~``.
    keep = ~df["question_category"].str.endswith("_B", na=False)

    annotators_by_pool = {
        1: ("Bernardo", "Alessandro", "Alessio"),
        2: ("Lenci", "Lucia", "Davide"),
        3: ("Giovanni", "Raffaella"),
    }

    assignments = {}
    for pool_pos, annotators in annotators_by_pool.items():
        group = df[(df["pool_pos"] == pool_pos) & keep].head(samples_per_group)
        for name in annotators:
            assignments[name] = group
    return assignments
26
 
27
def load_existing_annotations():
    """Load the annotations already stored in the HF dataset repo.

    Downloads ``CSV_FILENAME`` from ``HF_REPO`` and parses it with pandas.

    Returns:
        The stored annotations, or an empty DataFrame with ``username`` and
        ``id`` columns when the file cannot be downloaded or parsed.
    """
    try:
        file_path = hf_hub_download(HF_REPO, CSV_FILENAME, repo_type="dataset")
        return pd.read_csv(file_path)
    except Exception as e:
        # Best-effort on purpose: a missing file is expected before the first
        # upload. Report the cause so real failures are not silent.
        print(f"Could not load existing annotations: {e}")
        return pd.DataFrame(columns=["username", "id"])
34
+
35
# Load the sample pool, the per-annotator assignments and the stored annotations
csv_file = "static/mc.csv"
assignments = assign_samples(csv_file)
existing_annotations = load_existing_annotations()

valid_users = [*assignments]

# Seed the session state; keys that already exist are left untouched
for _key, _default in (("username", None), ("index", 0), ("results", [])):
    if _key not in st.session_state:
        st.session_state[_key] = _default
49
+
50
def update_name():
    """Make the name picked in the form the active user and restart at sample 0."""
    state = st.session_state
    state.username = state.selected_user
    state.index = 0  # progress starts over for the chosen user
54
+
55
# Ask for the annotator's name until one has been chosen
if st.session_state.username is None:
    with st.form("user_form"):
        st.write("### Select Your Name")
        # The selection is read back through its widget key in update_name,
        # so the return values of the widgets are not kept.
        st.selectbox("Choose your name:", valid_users, key="selected_user")
        st.form_submit_button("Start", on_click=update_name)
    st.stop()

# Get assigned dataset and remove already labeled samples
full_dataset = assignments[st.session_state.username].reset_index(drop=True)
labeled_by_user = existing_annotations["username"] == st.session_state.username
user_labeled_ids = existing_annotations.loc[labeled_by_user, "id"].tolist()
dataset = full_dataset[~full_dataset["id"].isin(user_labeled_ids)].reset_index(drop=True)

# If all samples are labeled, stop execution
if dataset.empty:
    st.write("### Great! You have completed your assignment. 🎉")
    st.stop()
71
+
72
def push_to_hf_hub(csv_path):
    """Upload the CSV at ``csv_path`` to the HF dataset repo, creating the repo if needed.

    Failures are printed rather than raised.
    """
    hub = HfApi()
    try:
        hub.create_repo(HF_REPO, repo_type="dataset", exist_ok=True)
        hub.upload_file(
            path_or_fileobj=csv_path,
            path_in_repo=CSV_FILENAME,
            repo_id=HF_REPO,
            repo_type="dataset",
        )
    except Exception as err:
        print(f"Error pushing to HF: {err}")
    else:
        print(f"Dataset updated: https://huggingface.co/datasets/{HF_REPO}")
80
+
81
def save_choice(choice_index):
    """Record the answer picked for the current sample and move to the next one.

    Used as the ``on_click`` callback of the two answer buttons. Once the last
    remaining sample has been answered, the session's results are merged with
    the annotations stored on the Hub, written to ``user_selections.csv`` and
    uploaded.

    Args:
        choice_index: 0 when "Answer 1" was selected, 1 for "Answer 2".
    """
    state = st.session_state
    sample = dataset.iloc[state.index]
    state.results.append({
        "username": state.username,
        "id": sample["id"],
        "video_id": sample["video_id"],
        "answer1": sample["answer1"],
        "answer2": sample["answer2"],
        "selected_answer": choice_index,
        "target": sample["target"],
        # Read the checkbox through its widget key rather than through a
        # module-level variable that is only assigned further down the script.
        "not_enough_info": bool(state.get("checkbox", False)),
    })

    state.index += 1
    state.checkbox = False  # reset the checkbox for the next sample

    if state.index < len(dataset):
        return

    # All remaining samples done
    st.write("### Great! You have completed your assignment. 🎉")
    result_df = pd.DataFrame(state.results)
    csv_path = "user_selections.csv"

    # Re-read the stored annotations right before merging so that uploads made
    # by other annotators since this script run started are not overwritten.
    # Fall back to the copy loaded at startup if the re-read comes back empty.
    stored = load_existing_annotations()
    if stored.empty:
        stored = existing_annotations
    if not stored.empty:
        result_df = pd.concat([stored, result_df]).drop_duplicates(
            subset=["username", "id"], keep="last"
        )

    result_df.to_csv(csv_path, index=False)
    push_to_hf_hub(csv_path)
    st.stop()
109
+
110
# dataset is rebuilt on every rerun while the index lives in the session, so
# guard against an index past the end instead of letting .iloc raise.
if st.session_state.index >= len(dataset):
    st.write("### No more samples to annotate.")
    st.stop()

# Select the current sample
sample = dataset.iloc[st.session_state.index]

# Display content
st.write(f"## Video Question Answering Sample - User: {st.session_state.username}")
st.write(f"**Question Category:** {sample.get('question_category', 'No category available')}")
st.video(sample["video_url"])

# Checkbox for uncertainty
not_enough_info = st.checkbox("The frame does not provide enough information to answer the question.", key='checkbox')

# Buttons: pass the chosen answer index to the callback through ``args``
st.button(f"Select Answer 1: {sample.get('answer1', 'No answer1 available')}", on_click=save_choice, args=(0,))
st.button(f"Select Answer 2: {sample.get('answer2', 'No answer2 available')}", on_click=save_choice, args=(1,))
static/gen.csv ADDED
The diff for this file is too large to render. See raw diff
 
static/mc.csv ADDED
The diff for this file is too large to render. See raw diff