Hjgugugjhuhjggg committed on
Commit 9bc4091 · verified · 1 Parent(s): 0ca6312

Update app.py

Files changed (1)
  1. app.py +375 -0
app.py CHANGED
@@ -333,5 +333,380 @@ def anonymize_ip():
 
  Thread(target=anonymize_ip).start()
 
+ if __name__ == "__main__":
+     iface.launch(share=True)
+
+ from pydantic import BaseModel
+ from llama_cpp import Llama
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import re
+ import os
+ from dotenv import load_dotenv
+ import spaces
+ import requests
+ import random
+ from faker import Faker
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ from threading import Thread
+ from time import sleep
+ from fastapi.staticfiles import StaticFiles
+ import gradio as gr
+ from typing import Dict, Any, Optional, Tuple
+ from urllib.parse import urlparse
+
+ load_dotenv()
+
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
+ global_data = {
+     'models': {},
+ }
+
+ model_configs = [
+     {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
+     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
+     {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
+     {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
+     {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
+     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
+     {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
+     {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
+     {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
+     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
+     {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
+     {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
+     {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
+     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
+     {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
+     {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
+     {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
+     {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
+ ]
+
+ class ModelManager:
+     def __init__(self):
+         self.models = {}
+
+     def load_model(self, model_config):
+         if model_config['name'] not in self.models:
+             try:
+                 print(f"Loading model {model_config['name']}...")
+                 self.models[model_config['name']] = Llama.from_pretrained(
+                     repo_id=model_config['repo_id'],
+                     filename=model_config['filename'],
+                     use_auth_token=HUGGINGFACE_TOKEN
+                 )
+                 print(f"Model {model_config['name']} loaded successfully.")
+             except Exception as e:
+                 print(f"Error loading model {model_config['name']}: {e}")
+
+     def load_all_models(self):
+         with ThreadPoolExecutor() as executor:
+             for config in model_configs:
+                 executor.submit(self.load_model, config)
+         return self.models
+
+ model_manager = ModelManager()
+ global_data['models'] = model_manager.load_all_models()
+
+ class ChatRequest(BaseModel):
+     message: str
+
+ def normalize_input(input_text):
+     return input_text.strip()
+
+ def remove_duplicates(text):
+     text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
+     text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
+     text = text.replace('[/INST]', '')
+     lines = text.split('\n')
+     unique_lines = []
+     seen_lines = set()
+     for line in lines:
+         if line not in seen_lines:
+             unique_lines.append(line)
+             seen_lines.add(line)
+     return '\n'.join(unique_lines)
+
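+ # Illustrative example (not part of the request flow): prompt-echo patterns
+ # collapse and the [/INST] marker is stripped before line-level deduplication.
+ #   remove_duplicates("How are you? [/INST]How are you? [/INST]\nfine\nfine")
+ #   -> "How are you? \nfine"
+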
+ PROXY_URL = "https://uhhy-fsfsfs.hf.space/valid"
+
+ def get_random_proxy():
+     try:
+         # A timeout keeps a dead proxy service from hanging the request forever
+         response = requests.get(PROXY_URL, timeout=10)
+         proxies = response.text.splitlines()
+         return random.choice(proxies)
+     except Exception as e:
+         print(f"Error fetching proxy: {e}")
+         return None
+
+ fake = Faker()
+
+ def generate_fake_ip():
+     return fake.ipv4()
+
+ def get_random_user_agent():
+     user_agents = [
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7; rv:89.0) Gecko/20100101 Firefox/89.0",
+         "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
+         "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1",
+         "Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1",
+         "Mozilla/5.0 (Android 11; Mobile; rv:89.0) Gecko/89.0 Firefox/89.0"
+     ]
+     return random.choice(user_agents)
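+
+ # Hypothetical helper (an illustrative sketch; nothing in this commit calls it):
+ # shows how the "ip:port" strings from get_random_proxy() and a random
+ # User-Agent could be wired into an outbound request.
+ def fetch_via_proxy(url: str) -> requests.Response:
+     proxy = get_random_proxy()
+     proxies = {"http": f"http://{proxy}", "https": f"http://{proxy}"} if proxy else None
+     headers = {"User-Agent": get_random_user_agent()}
+     return requests.get(url, proxies=proxies, headers=headers, timeout=10)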
+
+ def get_model_name_from_url(url: str) -> str:
+     """Extracts the model (repo) name from a Hugging Face model URL."""
+     parsed_url = urlparse(url)
+     path_parts = parsed_url.path.split('/')
+     if len(path_parts) >= 2:
+         # The last path segment is the repo name; the one before it is the owner
+         return path_parts[-1]
+     else:
+         return "Unknown Model"
+
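+ # Example: "https://huggingface.co/Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF" has the path
+ # "/Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", so "gpt2-xl-Q2_K-GGUF" is returned.
+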
+ def get_model_config_by_name(model_name: str) -> Dict[str, Any]:
+     """Finds the model configuration based on the model name."""
+     for config in model_configs:
+         if config['name'] == model_name:
+             return config
+     return {}  # Return an empty dictionary if not found
+
+ def load_model_from_url(url: str) -> Optional[Llama]:
+     """Loads a Llama model from a Hugging Face model URL."""
+     model_name = get_model_name_from_url(url)
+     model_config = get_model_config_by_name(model_name)
+     if model_config:
+         try:
+             print(f"Loading model {model_name}...")
+             model = Llama.from_pretrained(
+                 repo_id=model_config['repo_id'],
+                 filename=model_config['filename'],
+                 use_auth_token=HUGGINGFACE_TOKEN
+             )
+             print(f"Model {model_name} loaded successfully.")
+             return model
+         except Exception as e:
+             print(f"Error loading model {model_name}: {e}")
+     else:
+         print(f"Model configuration not found for {model_name}")
+     return None
+
+ async def generate_model_response(model: Llama, inputs: str) -> str:
+     """Generates a response from the model."""
+     try:
+         print(f"Generating response for model: {model}")
+         response = model(inputs)
+         print(f"Response from {model}: {response}")
+         return remove_duplicates(response['choices'][0]['text'])
+     except Exception as e:
+         print(f"Error with model: {e}")
+         return "Error generating response. Please try again later."
+
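+ # Note: model(inputs) uses llama-cpp-python's completion defaults, including a
+ # small max_tokens budget; a longer completion could be requested explicitly,
+ # e.g. model(inputs, max_tokens=256).
+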
+ def remove_repetitive_responses(responses: Dict[str, str]) -> Dict[str, str]:
+     """Removes duplicate responses from a dictionary of model responses."""
+     unique_responses = {}
+     for model, response in responses.items():
+         if response not in unique_responses.values():
+             unique_responses[model] = response
+     return unique_responses
+
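+ # Example: {"A": "hi", "B": "hi", "C": "yo"} -> {"A": "hi", "C": "yo"}
+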
+ # Request a ZeroGPU worker for this call. spaces.GPU does not accept arbitrary
+ # scheduler/training keywords; duration (in seconds) is the main supported
+ # argument, and 120 here is an assumed value.
+ @spaces.GPU(duration=120)
+ async def process_message(message: str) -> Tuple[str, str]:
+     """Processes a user message and generates responses from multiple LLMs."""
+     inputs = normalize_input(message)
+
+     # Retrieve models from global_data and process responses
+     responses = {}
+     for model_name, model in global_data['models'].items():
+         responses[model_name] = await generate_model_response(model, inputs)
+
+     unique_responses = remove_repetitive_responses(responses)
+     formatted_response = ""
+     for model, response in unique_responses.items():
+         formatted_response += f"**{model}:**\n{response}\n\n"
+
+     curl_command = f"""
+ curl -X POST -H "Content-Type: application/json" \\
+      -d '{{"message": "{message}"}}' \\
+      http://localhost:7860/generate
+ """
+     return formatted_response, curl_command
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.post("/generate")
+ async def generate_response(request: Request):
+     """Handles API requests to generate responses."""
+     data = await request.json()
+     message = data.get("message")
+     if not message:
+         return JSONResponse(status_code=400, content={"error": "Message is required."})
+
+     response, _ = await process_message(message)
+     return JSONResponse(content={"response": response})
+
+ # Mounted after the API routes so the catch-all static mount at "/" does not
+ # shadow POST /generate.
+ app.mount("/", StaticFiles(directory="public", html=True), name="static")
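+
+ # Illustrative client call (assumes the FastAPI app is actually served on
+ # localhost:7860, e.g. via `uvicorn app:app --port 7860`):
+ #   requests.post("http://localhost:7860/generate", json={"message": "Hello"}).json()["response"]
+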
+ iface = gr.Interface(
+     fn=process_message,
+     inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
+     outputs=[gr.Markdown(), gr.Textbox(label="cURL command")],
+     title="Multi-Model LLM API",
+     description="Enter a message and get responses from multiple LLMs.",
+ )
+
+ def anonymize_ip():
+     """Continuously updates IP addresses to anonymize requests."""
+     while True:
+         # Sleep briefly so the loop yields instead of busy-spinning a core
+         sleep(1)
+         os.environ['HTTP_X_FORWARDED_FOR'] = generate_fake_ip()
+         os.environ['REMOTE_ADDR'] = generate_fake_ip()
+
+ # daemon=True lets the process exit without waiting on this infinite loop
+ Thread(target=anonymize_ip, daemon=True).start()
+
  if __name__ == "__main__":
      iface.launch(share=True)