Upload handle_stage3.py with huggingface_hub
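A commit like this is normally produced with the huggingface_hub client. A minimal sketch of the upload call, assuming the file sits in the current directory; repo_id is a placeholder, since the target repository is not shown on this page:

from huggingface_hub import HfApi

api = HfApi()  # uses the token from the local `huggingface-cli login`
api.upload_file(
    path_or_fileobj="handle_stage3.py",   # local file to upload
    path_in_repo="handle_stage3.py",      # destination path inside the repo
    repo_id="user/repo",                  # placeholder repo id
    commit_message="Upload handle_stage3.py with huggingface_hub",
)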
handle_stage3.py
ADDED (+105 -0)
@@ -0,0 +1,105 @@
import os
import json
import threading
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


# Paths
data_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage3/filtered_video_image_asr_caption_stage3.json'
audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'
audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/audio_caption'
video_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video'
image_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video/Video-LLaVA'
new_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/video_image_asr_caption_pre_1208.json'

# Load JSON data
with open(data_json_path, 'r') as f:
    data = json.load(f)

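# Each item references its media by a path relative to the folder for that
# modality; illustrative shape (filenames are made up):
# {"video": "clip_0001.mp4", "audio_asr": "asr/clip_0001.wav"}
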
# Function to check if a file exists in a folder
def file_exists(folder, filename):
    return os.path.exists(os.path.join(folder, filename))

# Initialize counters for missing and total files by type
file_counts = {
    "video": {"total": 0, "missing": 0},
    "audio_asr": {"total": 0, "missing": 0},
    "audio_caption": {"total": 0, "missing": 0},
    "image": {"total": 0, "missing": 0},
    "unknown": {"total": 0, "missing": 0}  # For items missing all types of files
}

# Lock guarding file_counts: worker threads increment these shared counters,
# and `+=` on a dict entry is not atomic across threads
counts_lock = threading.Lock()

# Helper function to process each item in the dataset
def process_item(item):
    result = {"item": item, "valid": True, "missing": []}
    found = False

    if 'video' in item:
        video_file = item['video']
        with counts_lock:
            file_counts["video"]["total"] += 1
        found = True
        if not video_file or not file_exists(video_folder, video_file):
            result['missing'].append(f"Video file missing or not found: {video_file}")
            result['valid'] = False
            with counts_lock:
                file_counts["video"]["missing"] += 1

    if 'audio_asr' in item:
        audio_asr_file = item['audio_asr']
        with counts_lock:
            file_counts["audio_asr"]["total"] += 1
        found = True
        if not audio_asr_file or not file_exists(audio_asr_folder, audio_asr_file):
            result['missing'].append(f"Audio ASR file missing or not found: {audio_asr_file}")
            result['valid'] = False
            with counts_lock:
                file_counts["audio_asr"]["missing"] += 1

    if 'audio_caption' in item:
        audio_caption_file = item['audio_caption']
        with counts_lock:
            file_counts["audio_caption"]["total"] += 1
        found = True
        if not audio_caption_file or not file_exists(audio_caption_folder, audio_caption_file):
            result['missing'].append(f"Audio caption file missing or not found: {audio_caption_file}")
            result['valid'] = False
            with counts_lock:
                file_counts["audio_caption"]["missing"] += 1

    if 'image' in item:
        image_file = item['image']
        with counts_lock:
            file_counts["image"]["total"] += 1
        found = True
        if not image_file or not file_exists(image_folder, image_file):
            result['missing'].append(f"Image file missing or not found: {image_file}")
            result['valid'] = False
            with counts_lock:
                file_counts["image"]["missing"] += 1

    if not found:
        result['valid'] = False
        with counts_lock:
            file_counts["unknown"]["total"] += 1
            file_counts["unknown"]["missing"] += 1  # Count as unknown if no valid key is found

    return result

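# Illustrative return value for an item whose video file cannot be found
# (filename made up):
#   {"item": {...}, "valid": False,
#    "missing": ["Video file missing or not found: clip_0001.mp4"]}
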
# Lists to store valid and invalid items
new_items = []
texts = []

# Use ThreadPoolExecutor for multithreaded processing
with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
    futures = {executor.submit(process_item, item): item for item in data}

    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        if result['valid']:
            new_items.append(result['item'])
        else:
            texts.append(result['item'])  # Collect invalid items if needed
            for missing in result['missing']:
                print(missing)

# Save new_items to a JSON file
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(new_items, f, ensure_ascii=False, indent=4)

# Print the summary of missing and total files by type
print(f"Saved {len(new_items)} valid items to {new_json_path}")
print("Total and missing files by type:")
for file_type, counts in file_counts.items():
    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")
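A note on the concurrency design: process_item increments the shared file_counts dict from up to 96 worker threads, and `+=` on a dict entry is a read-modify-write that is not atomic in Python, which is why the increments are serialized with counts_lock. A minimal self-contained sketch of that pattern (all names here are illustrative, not part of the script above):

import threading
from concurrent.futures import ThreadPoolExecutor

counts = {"checked": 0}
counts_lock = threading.Lock()

def check(_):
    # Without the lock, concurrent `+=` can silently drop increments;
    # with it, the final total is always exact.
    with counts_lock:
        counts["checked"] += 1

with ThreadPoolExecutor(max_workers=8) as executor:
    list(executor.map(check, range(10_000)))

print(counts["checked"])  # prints 10000

Threads suit this script because the per-item work is I/O-bound (os.path.exists against network storage), so a high max_workers such as 96 is plausible; for CPU-bound work it would oversubscribe the machine.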