Update README.md
Browse files
README.md
CHANGED
@@ -660,7 +660,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
660 |
<details>
|
661 |
<summary>Benchmarking Command</summary>
|
662 |
```
|
663 |
-
guidellm --model
|
664 |
```
|
665 |
|
666 |
</details>
|
@@ -695,7 +695,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
695 |
<tr>
|
696 |
<th rowspan="3" valign="top">A100</th>
|
697 |
<td>4</td>
|
698 |
-
<td>
|
699 |
<td></td>
|
700 |
<td>7.5</td>
|
701 |
<td>67</td>
|
@@ -706,7 +706,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
706 |
</tr>
|
707 |
<tr>
|
708 |
<td>2</td>
|
709 |
-
<td>
|
710 |
<td>1.86</td>
|
711 |
<td>8.1</td>
|
712 |
<td>124</td>
|
@@ -717,7 +717,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
717 |
</tr>
|
718 |
<tr>
|
719 |
<td>2</td>
|
720 |
-
<td>
|
721 |
<td>2.52</td>
|
722 |
<td>6.9</td>
|
723 |
<td>147</td>
|
@@ -729,7 +729,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
729 |
<tr>
|
730 |
<th rowspan="3" valign="top">H100</th>
|
731 |
<td>4</td>
|
732 |
-
<td>
|
733 |
<td></td>
|
734 |
<td>4.4</td>
|
735 |
<td>67</td>
|
@@ -740,7 +740,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
740 |
</tr>
|
741 |
<tr>
|
742 |
<td>2</td>
|
743 |
-
<td>
|
744 |
<td>1.82</td>
|
745 |
<td>4.7</td>
|
746 |
<td>120</td>
|
@@ -751,7 +751,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
751 |
</tr>
|
752 |
<tr>
|
753 |
<td>2</td>
|
754 |
-
<td>
|
755 |
<td>1.87</td>
|
756 |
<td>4.7</td>
|
757 |
<td>120</td>
|
@@ -794,7 +794,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
794 |
<tbody style="text-align: center">
|
795 |
<tr>
|
796 |
<th rowspan="3" valign="top">A100x4</th>
|
797 |
-
<td>
|
798 |
<td></td>
|
799 |
<td>0.4</td>
|
800 |
<td>222</td>
|
@@ -804,28 +804,28 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
804 |
<td>399</td>
|
805 |
</tr>
|
806 |
<tr>
|
807 |
-
<td>
|
808 |
<td>1.70</td>
|
809 |
<td>1.6</td>
|
810 |
-
<td>
|
811 |
<td>2.2</td>
|
812 |
-
<td>
|
813 |
<td>2.6</td>
|
814 |
-
<td>
|
815 |
</tr>
|
816 |
<tr>
|
817 |
-
<td>
|
818 |
<td>1.48</td>
|
819 |
<td>1.0</td>
|
820 |
-
<td>
|
821 |
<td>2.0</td>
|
822 |
-
<td>
|
823 |
<td>2.8</td>
|
824 |
-
<td>
|
825 |
</tr>
|
826 |
<tr>
|
827 |
<th rowspan="3" valign="top">H100x4</th>
|
828 |
-
<td>
|
829 |
<td></td>
|
830 |
<td>1.0</td>
|
831 |
<td>284</td>
|
@@ -835,24 +835,24 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
835 |
<td>511</td>
|
836 |
</tr>
|
837 |
<tr>
|
838 |
-
<td>
|
839 |
<td>1.61</td>
|
840 |
<td>3.4</td>
|
841 |
-
<td>
|
842 |
<td>5.2</td>
|
843 |
-
<td>
|
844 |
<td>6.4</td>
|
845 |
-
<td>
|
846 |
</tr>
|
847 |
<tr>
|
848 |
-
<td>
|
849 |
<td>1.33</td>
|
850 |
<td>2.8</td>
|
851 |
-
<td>
|
852 |
<td>4.4</td>
|
853 |
-
<td>
|
854 |
<td>5.4</td>
|
855 |
-
<td>
|
856 |
</tr>
|
857 |
</tbody>
|
858 |
</table>
|
@@ -861,7 +861,7 @@ The following performance benchmarks were conducted with [vLLM](https://docs.vll
|
|
861 |
|
862 |
**QPS: Queries per second.
|
863 |
|
864 |
-
**QPD: Queries per dollar, based on on-demand cost at [Lambda Labs](https://lambdalabs.com/service/gpu-cloud) (observed on 2/18/2025).
|
865 |
|
866 |
## The Mistral AI Team
|
867 |
|
|
|
660 |
<details>
|
661 |
<summary>Benchmarking Command</summary>
|
662 |
```
|
663 |
+
guidellm --model neuralmagic/Pixtral-Large-Instruct-2411-hf-FP8-dynamic --target "http://localhost:8000/v1" --data-type emulated --data prompt_tokens=<prompt_tokens>,generated_tokens=<generated_tokens>,images=<num_images>,width=<image_width>,height=<image_height> --max-seconds 120 --backend aiohttp_server
|
664 |
```
|
665 |
|
666 |
</details>
|
|
|
695 |
<tr>
|
696 |
<th rowspan="3" valign="top">A100</th>
|
697 |
<td>4</td>
|
698 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf</td>
|
699 |
<td></td>
|
700 |
<td>7.5</td>
|
701 |
<td>67</td>
|
|
|
706 |
</tr>
|
707 |
<tr>
|
708 |
<td>2</td>
|
709 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w8a8</td>
|
710 |
<td>1.86</td>
|
711 |
<td>8.1</td>
|
712 |
<td>124</td>
|
|
|
717 |
</tr>
|
718 |
<tr>
|
719 |
<td>2</td>
|
720 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w4a16</td>
|
721 |
<td>2.52</td>
|
722 |
<td>6.9</td>
|
723 |
<td>147</td>
|
|
|
729 |
<tr>
|
730 |
<th rowspan="3" valign="top">H100</th>
|
731 |
<td>4</td>
|
732 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf</td>
|
733 |
<td></td>
|
734 |
<td>4.4</td>
|
735 |
<td>67</td>
|
|
|
740 |
</tr>
|
741 |
<tr>
|
742 |
<td>2</td>
|
743 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-FP8-dynamic</td>
|
744 |
<td>1.82</td>
|
745 |
<td>4.7</td>
|
746 |
<td>120</td>
|
|
|
751 |
</tr>
|
752 |
<tr>
|
753 |
<td>2</td>
|
754 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w4a16</td>
|
755 |
<td>1.87</td>
|
756 |
<td>4.7</td>
|
757 |
<td>120</td>
|
|
|
794 |
<tbody style="text-align: center">
|
795 |
<tr>
|
796 |
<th rowspan="3" valign="top">A100x4</th>
|
797 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf</td>
|
798 |
<td></td>
|
799 |
<td>0.4</td>
|
800 |
<td>222</td>
|
|
|
804 |
<td>399</td>
|
805 |
</tr>
|
806 |
<tr>
|
807 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w8a8</td>
|
808 |
<td>1.70</td>
|
809 |
<td>1.6</td>
|
810 |
+
<td>766</td>
|
811 |
<td>2.2</td>
|
812 |
+
<td>1142</td>
|
813 |
<td>2.6</td>
|
814 |
+
<td>1348</td>
|
815 |
</tr>
|
816 |
<tr>
|
817 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w4a16</td>
|
818 |
<td>1.48</td>
|
819 |
<td>1.0</td>
|
820 |
+
<td>552</td>
|
821 |
<td>2.0</td>
|
822 |
+
<td>1010</td>
|
823 |
<td>2.8</td>
|
824 |
+
<td>1360</td>
|
825 |
</tr>
|
826 |
<tr>
|
827 |
<th rowspan="3" valign="top">H100x4</th>
|
828 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf</td>
|
829 |
<td></td>
|
830 |
<td>1.0</td>
|
831 |
<td>284</td>
|
|
|
835 |
<td>511</td>
|
836 |
</tr>
|
837 |
<tr>
|
838 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-FP8-dynamic</td>
|
839 |
<td>1.61</td>
|
840 |
<td>3.4</td>
|
841 |
+
<td>905</td>
|
842 |
<td>5.2</td>
|
843 |
+
<td>1406</td>
|
844 |
<td>6.4</td>
|
845 |
+
<td>1759</td>
|
846 |
</tr>
|
847 |
<tr>
|
848 |
+
<td>neuralmagic/Pixtral-Large-Instruct-2411-hf-quantized.w4a16</td>
|
849 |
<td>1.33</td>
|
850 |
<td>2.8</td>
|
851 |
+
<td>761</td>
|
852 |
<td>4.4</td>
|
853 |
+
<td>1228</td>
|
854 |
<td>5.4</td>
|
855 |
+
<td>1480</td>
|
856 |
</tr>
|
857 |
</tbody>
|
858 |
</table>
|
|
|
861 |
|
862 |
**QPS: Queries per second.
|
863 |
|
864 |
+
**QPD: Queries per dollar, based on on-demand cost at [Lambda Labs](https://lambdalabs.com/service/gpu-cloud) (observed on 2/18/2025).
|
865 |
|
866 |
## The Mistral AI Team
|
867 |
|