meg-huggingface
committed on
Commit
·
3d16b0d
1
Parent(s):
7d70d90
Inference endpoint figuring
Browse files
src/backend/inference_endpoint.py
CHANGED
@@ -6,83 +6,102 @@ from huggingface_hub import create_inference_endpoint, get_inference_endpoint
|
|
6 |
from src.backend.run_toxicity_eval import get_generation
|
7 |
from src.logging import setup_logger
|
8 |
import requests
|
|
|
9 |
logging.basicConfig(level=logging.DEBUG)
|
10 |
logger = setup_logger(__name__)
|
11 |
-
TIMEOUT=20
|
|
|
12 |
|
13 |
-
|
14 |
-
|
|
|
|
|
15 |
logger.info("Creating endpoint %s..." % endpoint_name)
|
16 |
# TODO(mm): Handle situation where it's paused
|
17 |
try:
|
18 |
-
endpoint = create_inference_endpoint(endpoint_name,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
except huggingface_hub.utils._errors.HfHubHTTPError as e:
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
except requests.exceptions.HTTPError as e:
|
22 |
-
|
|
|
|
|
|
|
23 |
except Exception as e:
|
24 |
-
logger.debug("Hit error")
|
25 |
logger.debug(e)
|
26 |
sys.exit()
|
27 |
endpoint.fetch()
|
28 |
-
logger.info("Endpoint status: %s." %
|
29 |
-
if endpoint.status ==
|
30 |
# Send a request to wake it up.
|
31 |
get_generation(endpoint.url, "Wake up")
|
32 |
sleep(TIMEOUT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
i = 0
|
34 |
-
while endpoint.status in [
|
|
|
35 |
if i >= 20:
|
36 |
logger.info("Model failed to respond. Exiting.")
|
37 |
sys.exit()
|
38 |
-
logger.debug(
|
|
|
39 |
sleep(TIMEOUT)
|
40 |
endpoint.fetch()
|
41 |
logger.debug("Endpoint status: %s." % (endpoint.status))
|
42 |
i += 1
|
43 |
-
logger.info("Endpoint created:")
|
44 |
-
logger.info(endpoint)
|
45 |
-
generation_url = endpoint.url
|
46 |
-
return generation_url
|
47 |
|
48 |
|
49 |
-
def update_endpoint_exception(
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
endpoint =
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
logger.debug("Attempting a new instance type.")
|
62 |
-
if instance_type == "nvidia-l4":
|
63 |
-
# Try a larger, different, more expensive GPU.
|
64 |
-
endpoint = create_inference_endpoint(endpoint_name,
|
65 |
-
repository=repository,
|
66 |
-
framework=framework, task=task,
|
67 |
-
accelerator=accelerator,
|
68 |
-
vendor=vendor, region=region,
|
69 |
-
type=type,
|
70 |
-
instance_size="x1",
|
71 |
-
instance_type="nvidia-a100")
|
72 |
-
elif instance_type == "a100" and instance_size == "x1":
|
73 |
-
endpoint = create_inference_endpoint(endpoint_name,
|
74 |
-
repository=repository,
|
75 |
-
framework=framework, task=task,
|
76 |
-
accelerator=accelerator,
|
77 |
-
vendor=vendor, region=region,
|
78 |
-
type=type,
|
79 |
-
instance_size="x4",
|
80 |
-
instance_type="nvidia-a10g")
|
81 |
-
else:
|
82 |
-
logger.info("Getting expensive to try to run this model without human oversight. Exiting.")
|
83 |
-
sys.exit()
|
84 |
return endpoint
|
85 |
|
86 |
|
87 |
if __name__ == '__main__':
|
88 |
-
generation_url = create_endpoint(
|
|
|
6 |
from src.backend.run_toxicity_eval import get_generation
|
7 |
from src.logging import setup_logger
|
8 |
import requests
|
9 |
+
|
10 |
logging.basicConfig(level=logging.DEBUG)
|
11 |
logger = setup_logger(__name__)
|
12 |
+
TIMEOUT = 20
|
13 |
+
|
14 |
|
15 |
+
def create_endpoint(endpoint_name, repository, framework='pytorch',
                    task='text-generation', accelerator='gpu', vendor='aws',
                    region='us-east-1', type='protected', instance_size='x4',
                    instance_type='nvidia-l4'):
    """Create (or reuse) a dedicated Inference Endpoint and return its URL.

    Args:
        endpoint_name: Name of the endpoint on the Hub.
        repository: Model repository to deploy (e.g. 'Qwen/Qwen2-7B').
        framework, task, accelerator, vendor, region, type, instance_size,
            instance_type: Forwarded to
            `huggingface_hub.create_inference_endpoint`. (`type` shadows the
            builtin, but the name is kept for keyword-caller compatibility.)

    Returns:
        The endpoint's generation URL.

    Exits the process (`sys.exit`) when the endpoint cannot be brought up.
    """
    logger.info("Creating endpoint %s..." % endpoint_name)
    # TODO(mm): Handle situation where it's paused
    try:
        endpoint = create_inference_endpoint(endpoint_name,
                                             repository=repository,
                                             framework=framework, task=task,
                                             accelerator=accelerator,
                                             vendor=vendor, region=region,
                                             type=type,
                                             instance_size=instance_size,
                                             instance_type=instance_type)
    except huggingface_hub.utils._errors.HfHubHTTPError as e:
        # Workload with the same name already exists error.
        # Use it again, just make sure it has the right settings.
        logger.debug("Hit error:")
        logger.debug(e)
        logger.debug("Attempting to update with the given parameters.")
        endpoint = get_inference_endpoint(endpoint_name)
        endpoint.update(repository=repository,
                        framework=framework, task=task,
                        accelerator=accelerator,
                        vendor=vendor, region=region,
                        type=type,
                        instance_size=instance_size,
                        instance_type=instance_type)
    except requests.exceptions.HTTPError as e:
        # Not enough compute, or wrong compute.
        logger.debug("Hit error:")
        logger.debug(e)
        # BUGFIX: when create_inference_endpoint raised, `endpoint` was never
        # bound, so passing it to update_endpoint_exception raised
        # UnboundLocalError. Look the workload up by name first.
        endpoint = update_endpoint_exception(
            get_inference_endpoint(endpoint_name))
    except Exception as e:
        logger.debug("Hit unaccounted-for error")
        logger.debug(e)
        sys.exit()
    endpoint.fetch()
    logger.info("Endpoint status: %s." % endpoint.status)
    if endpoint.status == 'scaledToZero':
        # Send a request to wake it up.
        get_generation(endpoint.url, "Wake up")
        sleep(TIMEOUT)
    elif endpoint.status == 'failed':
        logger.info("Endpoint failed, attempting to change compute.")
        endpoint = update_endpoint_exception(endpoint)
    # NOTE(review): scrape lost indentation — this call is reconstructed at
    # function level (always wait once), which matches the retry check below.
    wait_for_endpoint(endpoint)
    # One retry on bigger compute if provisioning itself failed.
    if endpoint.status == 'failed':
        logger.info("Endpoint failed, attempting to change compute.")
        endpoint = update_endpoint_exception(endpoint)
        wait_for_endpoint(endpoint)
    logger.info("Endpoint created:")
    logger.info(endpoint)
    generation_url = endpoint.url
    if generation_url is None:
        logger.debug("Failed to create an endpoint. Exiting.")
        sys.exit()
    return generation_url
|
74 |
+
|
75 |
+
|
76 |
+
def wait_for_endpoint(endpoint, max_attempts=20):
    """Block until `endpoint` leaves a transitional state.

    Polls while the endpoint reports 'pending' or 'initializing',
    re-fetching its state every TIMEOUT seconds. Terminal states
    ('failed', 'running', 'scaledToZero') end the wait.

    Args:
        endpoint: Endpoint handle exposing `.status` and `.fetch()`.
        max_attempts: Polls before giving up (default 20, matching the
            previously hard-coded limit).

    Exits the process (`sys.exit`) when the limit is reached.
    """
    attempts = 0
    while endpoint.status in ('pending', 'initializing'):
        if attempts >= max_attempts:
            logger.info("Model failed to respond. Exiting.")
            sys.exit()
        logger.debug(
            "Waiting %d seconds to check again if the endpoint is running." % TIMEOUT)
        sleep(TIMEOUT)
        endpoint.fetch()
        logger.debug("Endpoint status: %s." % (endpoint.status))
        attempts += 1
|
|
|
|
|
|
|
|
|
89 |
|
90 |
|
91 |
+
def update_endpoint_exception(endpoint):
    """Escalate a struggling endpoint to the next compute tier.

    Upgrade path: nvidia-l4/x4 -> nvidia-a100/x1 -> nvidia-a10g/x4.
    Past the known tiers the process exits rather than escalate spend
    without human oversight.

    Args:
        endpoint: Endpoint handle exposing `.raw` and `.update(...)`.

    Returns:
        The same endpoint, with an update request submitted.
    """
    raw_info = endpoint.raw
    cur_instance_size = raw_info['compute']['instanceSize']
    cur_instance_type = raw_info['compute']['instanceType']
    if (cur_instance_type, cur_instance_size) == ('nvidia-l4', 'x4'):
        # Try a larger, different, more expensive GPU.
        endpoint.update(instance_size='x1', instance_type='nvidia-a100')
    elif (cur_instance_type, cur_instance_size) == ('nvidia-a100', 'x1'):
        # BUGFIX: this branch previously compared against 'a100', which can
        # never match the 'nvidia-a100' instance type set by the branch
        # above, leaving the second escalation step unreachable.
        endpoint.update(instance_size='x4', instance_type='nvidia-a10g')
    else:
        logger.info(
            "Getting expensive to try to run this model without human oversight. Exiting.")
        sys.exit()
    return endpoint
|
104 |
|
105 |
|
106 |
if __name__ == '__main__':
    # Manual smoke test: provision a throwaway endpoint for a small model.
    # NOTE(review): this creates (billable) cloud compute when run directly.
    generation_url = create_endpoint('this-is-a-test', 'Qwen/Qwen2-7B')
|