Upload folder using huggingface_hub
Browse files- handle_stage4.py +179 -0
handle_stage4.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import concurrent.futures
import json
import os
import random
import time

import cv2
from concurrent.futures import ThreadPoolExecutor, as_completed
from moviepy.editor import *
from tqdm import tqdm
|
12 |
+
|
13 |
+
def extract_frame(video_path):
    """Save the last readable frame of *video_path* as a JPEG next to it.

    Tries the final frame first and falls back to the second-to-last one
    (the very last frame of some encodes cannot be decoded). Delegates the
    actual write (and the already-exists check) to ``save_frame``.
    """
    capture = cv2.VideoCapture(video_path)
    total_frames = capture.get(cv2.CAP_PROP_FRAME_COUNT)
    grabbed = False
    # Attempt the last frame, then the one before it.
    for offset in (1, 2):
        capture.set(cv2.CAP_PROP_POS_FRAMES, total_frames - offset)
        grabbed, frame = capture.read()
        if grabbed:
            save_frame(video_path, frame)
            break
    if not grabbed:
        print(f"无法读取最后一帧和倒数第二帧:{video_path}")
    # Always release the capture handle.
    capture.release()
|
35 |
+
|
36 |
+
def save_frame(video_path, frame):
    """Write *frame* as ``<video_name>.jpg`` beside the video file.

    Skips the write (and says so) when the JPEG already exists, so reruns
    are idempotent.
    """
    directory = os.path.dirname(video_path)
    stem = os.path.splitext(os.path.basename(video_path))[0]
    image_path = os.path.join(directory, f"{stem}.jpg")
    if os.path.exists(image_path):
        print(f"图像已存在:{stem}")
    else:
        cv2.imwrite(image_path, frame)
        print(f"保存最后一帧为 {image_path}")
|
47 |
+
|
48 |
+
|
49 |
+
def process_video(file_path):
    """Extract the audio track of *file_path* into an .mp3 or .wav sidecar.

    The output format is chosen at random between mp3 and wav; extraction
    is skipped when either variant already exists next to the video, so
    reruns are idempotent. Errors are logged rather than raised; a
    "Resource temporarily unavailable" error triggers a 20-second back-off
    (this function runs on many worker threads).
    """
    clip = None
    audio = None
    try:
        clip = VideoFileClip(file_path)
        if clip.audio is not None:
            audio = clip.audio
            base = os.path.splitext(file_path)[0]
            audio_format = random.choice(["mp3", "wav"])
            audio_file_path = base + f'.{audio_format}'
            # Skip if either format was already written on a previous run.
            if not os.path.exists(base + '.wav') and not os.path.exists(base + '.mp3'):
                audio.write_audiofile(audio_file_path)
            else:
                # Fix: message said "exit." — typo for "exists."
                print(f"file {audio_file_path} exists.")
    except Exception as e:
        print(f"An error occurred while processing the file {file_path}: {e}")
        if "Resource temporarily unavailable" in str(e):
            # Back off so transient resource exhaustion can clear.
            # (Original called time.sleep without importing time -> NameError.)
            time.sleep(20)
    finally:
        # Fix: the original leaked the clip when the video had no audio
        # track, and closed nothing when an exception occurred mid-way.
        if audio is not None:
            audio.close()
        if clip is not None:
            clip.close()
|
75 |
+
|
76 |
+
|
77 |
+
# Paths
# NOTE(review): all paths are hard-coded to one mounted volume; consider
# argparse/env-var configuration before reusing this script elsewhere.
data_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/hf/stage4_next_json/video_stag4_0116_next.json'
audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'
audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/caption_data/0818'
video_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video'
image_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video/Video-LLaVA'
new_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/stage4_0119.json'

# Load JSON data
# Expected to be a list of dataset items (dicts with optional 'video',
# 'audio_asr', 'audio_caption', 'image' keys) — see process_item below.
with open(data_json_path, 'r') as f:
    data = json.load(f)
|
88 |
+
|
89 |
+
# Function to check if a file exists in a folder
|
90 |
+
def file_exists(folder, filename):
    """Return True if *filename* exists under *folder*."""
    candidate = os.path.join(folder, filename)
    return os.path.exists(candidate)
|
92 |
+
|
93 |
+
# Initialize counters for missing and total files by type
|
94 |
+
# Per-type tallies: how many files each media key referenced ("total")
# and how many of those were missing on disk ("missing").
# NOTE(review): mutated from many worker threads with non-atomic += —
# counts may drift slightly under contention.
file_counts = {
    "video": {"total": 0, "missing": 0},
    "audio_asr": {"total": 0, "missing": 0},
    "audio_caption": {"total": 0, "missing": 0},
    "image": {"total": 0, "missing": 0},
    "unknown": {"total": 0, "missing": 0}  # For items missing all types of files
}
|
101 |
+
|
102 |
+
# Helper function to process each item in the dataset
|
103 |
+
def process_item(item):
    """Validate one dataset item's media references and pre-process videos.

    For each media key present in *item* ('video', 'audio_asr',
    'audio_caption', 'image'), tallies it in the shared ``file_counts``
    dict and records a message when the referenced file is missing on
    disk. Videos that do exist additionally get their audio track
    extracted (``process_video``) and their last frame saved as a JPEG
    (``extract_frame``).

    NOTE(review): runs on many worker threads but updates ``file_counts``
    with non-atomic ``+= 1``; totals can drift under contention — guard
    with a shared lock if exact counts matter.

    Returns {"item": item, "valid": bool, "missing": [str, ...]};
    "valid" is False when any referenced file is missing or no media key
    was found at all.
    """
    result = {"item": item, "valid": True, "missing": []}
    found = False

    if 'video' in item:
        video_file = item['video']
        file_counts["video"]["total"] += 1
        found = True
        if not video_file or not file_exists(video_folder, video_file):
            result['missing'].append(f"Video file missing or not found: {video_file}")
            result['valid'] = False
            file_counts["video"]["missing"] += 1
        else:
            # Fix: the original ran these BEFORE the existence check, so
            # missing videos were fed to the decoders and errored out.
            # Only pre-process videos that are actually present.
            video_path = os.path.join(video_folder, video_file)
            process_video(video_path)
            extract_frame(video_path)

    if 'audio_asr' in item:
        audio_asr_file = item['audio_asr']
        file_counts["audio_asr"]["total"] += 1
        found = True
        if not audio_asr_file or not file_exists(audio_asr_folder, audio_asr_file):
            result['missing'].append(f"Audio ASR file missing or not found: {audio_asr_file}")
            result['valid'] = False
            file_counts["audio_asr"]["missing"] += 1

    if 'audio_caption' in item:
        audio_caption_file = item['audio_caption']
        file_counts["audio_caption"]["total"] += 1
        found = True
        if not audio_caption_file or not file_exists(audio_caption_folder, audio_caption_file):
            result['missing'].append(f"Audio caption file missing or not found: {audio_caption_file}")
            result['valid'] = False
            file_counts["audio_caption"]["missing"] += 1

    if 'image' in item:
        image_file = item['image']
        file_counts["image"]["total"] += 1
        found = True
        if not image_file or not file_exists(image_folder, image_file):
            result['missing'].append(f"Image file missing or not found: {image_file}")
            result['valid'] = False
            file_counts["image"]["missing"] += 1

    if not found:
        # No recognized media key at all: invalid, tallied as "unknown".
        result['valid'] = False
        file_counts["unknown"]["total"] += 1
        file_counts["unknown"]["missing"] += 1

    return result
|
153 |
+
|
154 |
+
# List to store results
new_items = []  # items whose referenced files all exist on disk
texts = []      # items with at least one missing file (kept for inspection)

# Use ThreadPoolExecutor for multithreaded processing.
# NOTE(review): 96 workers is tuned for a large machine; each task may do
# heavy video decode + audio extraction via process_item.
with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
    futures = {executor.submit(process_item, item): item for item in data}

    # Consume results as they complete; tqdm shows overall progress.
    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        if result['valid']:
            new_items.append(result['item'])
        else:
            texts.append(result['item'])  # Collect invalid items if needed
            for missing in result['missing']:
                print(missing)
|
170 |
+
|
171 |
+
# Save new_items to a JSON file (ensure_ascii=False keeps CJK text readable).
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(new_items, f, ensure_ascii=False, indent=4)

# Print the summary of missing and total files by type
print(f"Saved {len(new_items)} valid items to {new_json_path}")
print(f"Total and missing files by type:")
for file_type, counts in file_counts.items():
    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")
|