multitensor committed on
Commit 93cc042 · verified · 1 Parent(s): 67ea55d

Upload folder using huggingface_hub

Files changed (1)
  1. handle_stage4.py +179 -0
handle_stage4.py ADDED
@@ -0,0 +1,179 @@
+ import os
+ import json
+ import time
+ import random
+ import cv2
+ from tqdm import tqdm
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from moviepy.editor import VideoFileClip
+
+ def extract_frame(video_path):
+     # Open the video file
+     video_capture = cv2.VideoCapture(video_path)
+     # Seek to the last frame
+     video_capture.set(cv2.CAP_PROP_POS_FRAMES, video_capture.get(cv2.CAP_PROP_FRAME_COUNT) - 1)
+     # Read the last frame
+     success, frame = video_capture.read()
+     if success:
+         # Save the frame
+         save_frame(video_path, frame)
+     else:
+         # Seek to the second-to-last frame
+         video_capture.set(cv2.CAP_PROP_POS_FRAMES, video_capture.get(cv2.CAP_PROP_FRAME_COUNT) - 2)
+         # Read the second-to-last frame
+         success, frame = video_capture.read()
+         if success:
+             # Save the frame
+             save_frame(video_path, frame)
+         else:
+             print(f"Could not read the last or second-to-last frame: {video_path}")
+     # Release the video capture
+     video_capture.release()
+
+ def save_frame(video_path, frame):
+     # Get the video file name (without extension)
+     video_name = os.path.splitext(os.path.basename(video_path))[0]
+     # Build the image save path next to the video
+     image_path = os.path.join(os.path.dirname(video_path), f"{video_name}.jpg")
+     # Only write the image if it does not already exist
+     if not os.path.exists(image_path):
+         cv2.imwrite(image_path, frame)
+         print(f"Saved last frame as {image_path}")
+     else:
+         print(f"Image already exists: {video_name}")
+
+
+ def process_video(file_path):
+     try:
+         # Load the video file
+         clip = VideoFileClip(file_path)
+         # Check whether the video has an audio track
+         if clip.audio is not None:
+             # Extract the audio track
+             audio = clip.audio
+             # Save it in a randomly chosen format, with the same name and directory as the video
+             audio_format = random.choice(["mp3", "wav"])
+             audio_file_path = os.path.splitext(file_path)[0] + f'.{audio_format}'
+             audio_file_wav = os.path.splitext(file_path)[0] + '.wav'
+             audio_file_mp3 = os.path.splitext(file_path)[0] + '.mp3'
+             if not os.path.exists(audio_file_wav) and not os.path.exists(audio_file_mp3):
+                 audio.write_audiofile(audio_file_path)
+             else:
+                 print(f"File {audio_file_path} already exists.")
+             # Close the audio and clip objects
+             audio.close()
+             clip.close()
+     except Exception as e:
+         if "Resource temporarily unavailable" in str(e):
+             print(f"An error occurred while processing the file {file_path}: {e}")
+             time.sleep(20)
+         else:
+             print(f"An error occurred while processing the file {file_path}: {e}")
+
+
+ # Paths
+ data_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/hf/stage4_next_json/video_stag4_0116_next.json'
+ audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'
+ audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/caption_data/0818'
+ video_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video'
+ image_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video/Video-LLaVA'
+ new_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/stage4_0119.json'
+
+ # Load JSON data
+ with open(data_json_path, 'r') as f:
+     data = json.load(f)
+
+ # Function to check if a file exists in a folder
+ def file_exists(folder, filename):
+     return os.path.exists(os.path.join(folder, filename))
+
+ # Counters for missing and total files by type.
+ # NOTE: these counters are incremented from multiple worker threads without a lock,
+ # so the reported totals may be slightly off under heavy contention.
+ file_counts = {
+     "video": {"total": 0, "missing": 0},
+     "audio_asr": {"total": 0, "missing": 0},
+     "audio_caption": {"total": 0, "missing": 0},
+     "image": {"total": 0, "missing": 0},
+     "unknown": {"total": 0, "missing": 0}  # For items missing all types of files
+ }
+
+ # Helper function to process each item in the dataset
+ def process_item(item):
+     result = {"item": item, "valid": True, "missing": []}
+     found = False
+
+     if 'video' in item:
+         video_file = item['video']
+         file_counts["video"]["total"] += 1
+         found = True
+         video_path = os.path.join(video_folder, video_file)
+         process_video(video_path)
+         extract_frame(video_path)
+
+         if not video_file or not file_exists(video_folder, video_file):
+             result['missing'].append(f"Video file missing or not found: {video_file}")
+             result['valid'] = False
+             file_counts["video"]["missing"] += 1
+
+     if 'audio_asr' in item:
+         audio_asr_file = item['audio_asr']
+         file_counts["audio_asr"]["total"] += 1
+         found = True
+         if not audio_asr_file or not file_exists(audio_asr_folder, audio_asr_file):
+             result['missing'].append(f"Audio ASR file missing or not found: {audio_asr_file}")
+             result['valid'] = False
+             file_counts["audio_asr"]["missing"] += 1
+
+     if 'audio_caption' in item:
+         audio_caption_file = item['audio_caption']
+         file_counts["audio_caption"]["total"] += 1
+         found = True
+         if not audio_caption_file or not file_exists(audio_caption_folder, audio_caption_file):
+             result['missing'].append(f"Audio caption file missing or not found: {audio_caption_file}")
+             result['valid'] = False
+             file_counts["audio_caption"]["missing"] += 1
+
+     if 'image' in item:
+         image_file = item['image']
+         file_counts["image"]["total"] += 1
+         found = True
+         if not image_file or not file_exists(image_folder, image_file):
+             result['missing'].append(f"Image file missing or not found: {image_file}")
+             result['valid'] = False
+             file_counts["image"]["missing"] += 1
+
+     if not found:
+         result['valid'] = False
+         file_counts["unknown"]["total"] += 1
+         file_counts["unknown"]["missing"] += 1  # Count as unknown if no valid key is found
+
+     return result
+
+ # Lists to store valid and invalid items
+ new_items = []
+ texts = []
+
+ # Use ThreadPoolExecutor for multithreaded processing
+ with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
+     futures = {executor.submit(process_item, item): item for item in data}
+
+     for future in tqdm(as_completed(futures), total=len(futures)):
+         result = future.result()
+         if result['valid']:
+             new_items.append(result['item'])
+         else:
+             texts.append(result['item'])  # Collect invalid items if needed
+             for missing in result['missing']:
+                 print(missing)
+
+ # Save new_items to a JSON file
+ with open(new_json_path, 'w', encoding='utf-8') as f:
+     json.dump(new_items, f, ensure_ascii=False, indent=4)
+
+ # Print the summary of missing and total files by type
+ print(f"Saved {len(new_items)} valid items to {new_json_path}")
+ print("Total and missing files by type:")
+ for file_type, counts in file_counts.items():
+     print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")