Spaces:

InternRobotics
/

InternManip-eval-demo

Running

App Files Files Community

yiyang34 committed 7 days ago

Commit

2c30e2d

1 Parent(s): 8eded48

parallel download from oss

Browse files

Files changed (2) hide show

app_utils.py +94 -3
requirements.txt +3 -1

app_utils.py CHANGED Viewed

@@ -14,6 +14,9 @@ import shutil
 from urllib.parse import urljoin
 import oss2
 from natsort import natsorted
 TMP_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
 os.makedirs(TMP_ROOT, exist_ok=True)
@@ -180,11 +183,65 @@ def list_oss_files(folder_path: str) -> List[str]:
             files.append(obj.key)
     return sorted(files, key=lambda x: os.path.splitext(x)[0])
 def download_oss_file(oss_path: str, local_path: str) -> bool:
     """从OSS下载文件到本地，返回是否成功"""
     try:
         result = bucket.get_object_to_file(oss_path, local_path)
-        print(f"下载: {oss_path}, {result.status}")
         return result.status == 200
     except Exception as e:
         print(f"下载失败: {e}")
@@ -254,12 +311,27 @@ def stream_simulation_results(result_folder: str, task_id: str, request: gr.Requ
             oss_files = list_oss_files(image_folder)
             new_files = [f for f in oss_files if f not in processed_files]
             for oss_path in new_files:
                 try:
                     # 下载文件到本地
                     filename = os.path.basename(oss_path)
                     local_path = os.path.join(local_image_dir, filename)
-                    download_oss_file(oss_path, local_path)
                     # 读取图片
                     frame = cv2.imread(local_path)
@@ -302,16 +374,30 @@ def create_video_segment(frames: List[np.ndarray], fps: int, width: int, height:
 def process_remaining_oss_images(oss_folder: str, local_dir: str, processed_files: set, frame_buffer: List[np.ndarray]):
     """处理OSS上剩余的图片"""
     try:
         oss_files = list_oss_files(oss_folder)
         new_files = [f for f in oss_files if f not in processed_files]
         for oss_path in new_files:
             try:
                 # 下载文件到本地
                 filename = os.path.basename(oss_path)
                 local_path = os.path.join(local_dir, filename)
-                download_oss_file(oss_path, local_path)
                 # 读取图片
                 frame = cv2.imread(local_path)
@@ -370,13 +456,18 @@ def get_task_status(task_id: str) -> dict:
     """
     查询任务状态
     """
     try:
         response = requests.get(
             f"{API_ENDPOINTS['query_status']}/{task_id}",
             timeout=5
         )
         return response.json()
     except Exception as e:
         return {"status": "error get_task_status", "message": str(e)}
 def terminate_task(task_id: str) -> Optional[dict]:

 from urllib.parse import urljoin
 import oss2
 from natsort import natsorted
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+import hashlib
 TMP_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
 os.makedirs(TMP_ROOT, exist_ok=True)
             files.append(obj.key)
     return sorted(files, key=lambda x: os.path.splitext(x)[0])
def parallel_download_oss_files(
    bucket,
    oss_folder: str,
    local_dir: str,
    file_list: List[str],
    max_workers: int = 5
) -> bool:
    """
    Download a list of OSS objects into one local directory in parallel.

    Args:
        bucket: OSS bucket object; only ``get_object_to_file(key, path)`` is
            called on it.
        oss_folder: OSS folder the files belong to. Currently unused: the
            entries of ``file_list`` are used as object keys as-is. Kept so
            that existing keyword callers continue to work.
        local_dir: Local directory to download into; created if missing.
        file_list: Object keys to download. A leading "/" is stripped, and
            each object is saved under its base name only, so keys from
            different sub-folders that share a base name overwrite each other.
        max_workers: Maximum number of concurrent download threads.

    Returns:
        True when every requested file was downloaded (trivially True for an
        empty ``file_list``), False when at least one download failed.
    """
    def download_single_file(oss_path, local_path):
        # Failures are reported and counted instead of raised, so one bad
        # object does not abort the rest of the batch.
        try:
            result = bucket.get_object_to_file(oss_path, local_path)
        except Exception as e:
            print(f"下载失败 {oss_path}: {e}")
            return False
        # Same success criterion as download_oss_file: only HTTP 200 counts.
        if result.status != 200:
            print(f"下载失败 {oss_path}: 状态码 {result.status}")
            return False
        return True

    # Make sure the target directory exists before any worker writes to it.
    os.makedirs(local_dir, exist_ok=True)

    # Build (object key, local path) pairs; the local layout is flat.
    tasks = []
    for key in file_list:
        oss_path = key.lstrip('/')
        local_path = os.path.join(local_dir, os.path.basename(oss_path))
        tasks.append((oss_path, local_path))

    # Nothing to fetch: skip the thread pool and the progress bar entirely.
    if not tasks:
        return True

    success_count = 0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(download_single_file, oss_path, local_path)
            for oss_path, local_path in tasks
        ]
        # Advance the progress bar as downloads finish, in completion order.
        for future in tqdm(as_completed(futures), total=len(tasks), desc="下载进度"):
            if future.result():
                success_count += 1

    print(f"下载完成: {success_count}/{len(tasks)} 成功")
    return success_count == len(tasks)
 def download_oss_file(oss_path: str, local_path: str) -> bool:
     """从OSS下载文件到本地，返回是否成功"""
+    start_time = time.time()  # 记录开始时间
     try:
         result = bucket.get_object_to_file(oss_path, local_path)
+        download_time = time.time() - start_time  # 计算下载耗时
+        print(f"下载: {oss_path}, 状态码: {result.status}, 耗时: {download_time:.2f}秒")
         return result.status == 200
     except Exception as e:
         print(f"下载失败: {e}")
             oss_files = list_oss_files(image_folder)
             new_files = [f for f in oss_files if f not in processed_files]
+            if len(new_files) != 0:
+                print(f"发现新文件: {len(new_files)} 个", new_files)
+                success = parallel_download_oss_files(
+                    bucket=bucket,
+                    oss_folder=image_folder + "/",
+                    local_dir=local_image_dir + "/",
+                    file_list=new_files,
+                    max_workers=5  # 根据网络带宽调整
+                )
+                if not success:
+                    raise gr.Error("无法从OSS同步图片文件")
+                # if not download_oss_files_with_ossutil(image_folder + "/", local_image_dir + "/"):
+                #     raise gr.Error("无法从OSS同步图片文件")
             for oss_path in new_files:
                 try:
                     # 下载文件到本地
                     filename = os.path.basename(oss_path)
                     local_path = os.path.join(local_image_dir, filename)
+                    # download_oss_file(oss_path, local_path)
                     # 读取图片
                     frame = cv2.imread(local_path)
 def process_remaining_oss_images(oss_folder: str, local_dir: str, processed_files: set, frame_buffer: List[np.ndarray]):
     """处理OSS上剩余的图片"""
     try:
         oss_files = list_oss_files(oss_folder)
         new_files = [f for f in oss_files if f not in processed_files]
+        if len(new_files) != 0:
+            print(f"发现新文件: {len(new_files)} 个", new_files)
+            success = parallel_download_oss_files(
+                bucket=bucket,
+                oss_folder=oss_folder + "/",
+                local_dir=local_dir + "/",
+                file_list=new_files,
+                max_workers=5  # 根据网络带宽调整
+            )
+            if not success:
+                raise gr.Error("无法从OSS同步图片文件")
         for oss_path in new_files:
             try:
                 # 下载文件到本地
                 filename = os.path.basename(oss_path)
                 local_path = os.path.join(local_dir, filename)
+                # download_oss_file(oss_path, local_path)
                 # 读取图片
                 frame = cv2.imread(local_path)
     """
     查询任务状态
     """
+    start_time = time.time()
     try:
         response = requests.get(
             f"{API_ENDPOINTS['query_status']}/{task_id}",
             timeout=5
         )
+        elapsed_time = time.time() - start_time  # 计算耗时
+        print(f"[查询任务状态] task_id: {task_id}, 耗时: {elapsed_time:.3f}s")
         return response.json()
     except Exception as e:
+        elapsed_time = time.time() - start_time  # 计算失败耗时
+        print(f"[查询任务状态失败] task_id: {task_id}, 错误: {str(e)}, 耗时: {elapsed_time:.3f}s")
         return {"status": "error get_task_status", "message": str(e)}
 def terminate_task(task_id: str) -> Optional[dict]:

requirements.txt CHANGED Viewed

@@ -5,4 +5,6 @@ opencv-python
 numpy
 python-dateutil
 oss2
-natsort

 numpy
 python-dateutil
 oss2
+natsort
+concurrent-log-handler
+tqdm