multitensor committed on
Commit
2564dd3
·
verified ·
1 Parent(s): b76588c

Upload handle_stage3.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. handle_stage3.py +105 -0
handle_stage3.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm
5
+
6
+
7
# Paths
# Input annotation file, plus the root folder that each media type's path is
# joined onto when checking for existence.
data_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage3/filtered_video_image_asr_caption_stage3.json'
audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'
audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/audio_caption'
video_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video'
image_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video/Video-LLaVA'
# Destination of the filtered annotation list.
new_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/video_image_asr_caption_pre_1208.json'

# Load JSON data
# The encoding is pinned to UTF-8 so the read matches the UTF-8 write of the
# filtered output and does not depend on the platform's default encoding.
with open(data_json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
18
+
19
+ # Function to check if a file exists in a folder
20
def file_exists(folder, filename):
    """Return True if `filename`, joined onto `folder`, exists on disk.

    Any existing path counts (regular file or directory), because the check
    is `os.path.exists`.
    """
    candidate = os.path.join(folder, filename)
    return os.path.exists(candidate)
22
+
23
# Per-media-type tallies: "total" counts entries that reference the type,
# "missing" counts those whose file failed the existence check. The "unknown"
# bucket is for entries that carry none of the recognised media keys.
file_counts = {
    media_type: {"total": 0, "missing": 0}
    for media_type in ("video", "audio_asr", "audio_caption", "image", "unknown")
}
31
+
32
# Guards file_counts: process_item runs on many worker threads, and the
# read-modify-write behind `+=` on a shared dict entry is not atomic.
_file_counts_lock = threading.Lock()


def process_item(item):
    """Check that every media file referenced by one dataset entry exists.

    For each of the keys 'video', 'audio_asr', 'audio_caption' and 'image'
    present in `item`, the value is joined onto the matching root folder and
    tested with `file_exists`. The entry is invalid when any referenced value
    is empty or not found, or when none of the four keys is present.

    Side effect: updates the module-level `file_counts` tallies (under a
    lock, so concurrent calls do not lose increments).

    Args:
        item: one dataset entry; must support `key in item` and `item[key]`.

    Returns:
        A dict with "item" (the input object itself), "valid" (bool) and
        "missing" (list of human-readable messages, one per failed file).
    """
    # (key in the entry, root folder, label used in the message)
    checks = (
        ("video", video_folder, "Video"),
        ("audio_asr", audio_asr_folder, "Audio ASR"),
        ("audio_caption", audio_caption_folder, "Audio caption"),
        ("image", image_folder, "Image"),
    )
    result = {"item": item, "valid": True, "missing": []}
    outcomes = []  # (counter bucket, whether the file was missing)

    for key, folder, label in checks:
        if key not in item:
            continue
        path = item[key]
        is_missing = not path or not file_exists(folder, path)
        if is_missing:
            result['missing'].append(f"{label} file missing or not found: {path}")
            result['valid'] = False
        outcomes.append((key, is_missing))

    if not outcomes:
        # No recognised media key at all: reject and count as unknown.
        result['valid'] = False
        outcomes.append(("unknown", True))

    # Apply the tallies in one short critical section, after the (slow)
    # filesystem checks, so threads only serialise on the counter update.
    with _file_counts_lock:
        for bucket, is_missing in outcomes:
            file_counts[bucket]["total"] += 1
            if is_missing:
                file_counts[bucket]["missing"] += 1

    return result
79
+
80
# Entries whose referenced files all passed validation, in input order.
new_items = []
# Entries that failed validation; collected for inspection, never written out.
texts = []

# Use ThreadPoolExecutor for multithreaded processing.
# executor.map yields results in submission order, so the filtered list keeps
# the ordering of `data` on every run (as_completed yields in finish order,
# which made the saved dataset order vary between runs).
with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
    for result in tqdm(executor.map(process_item, data), total=len(data)):
        if result['valid']:
            new_items.append(result['item'])
        else:
            texts.append(result['item'])  # Collect invalid items if needed
            for missing in result['missing']:
                print(missing)
96
+
97
# Save new_items to a JSON file.
# Create the destination directory first so a missing folder does not throw
# away the results of the whole validation pass at the very last step.
output_dir = os.path.dirname(new_json_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(new_items, f, ensure_ascii=False, indent=4)
100
+
101
# Report how many entries were kept, then the total/missing tally per type.
print(f"Saved {len(new_items)} valid items to {new_json_path}")
print("Total and missing files by type:")
for kind, tally in file_counts.items():
    print(f"{kind}: Total = {tally['total']}, Missing = {tally['missing']}")