diff --git "a/README.md" "b/README.md" --- "a/README.md" +++ "b/README.md" @@ -1,396 +1,2851 @@ --- +library_name: sentence-transformers +pipeline_tag: sentence-similarity tags: -- setfit -- sentence-transformers -- text-classification -- generated_from_setfit_trainer -widget: -- text: Insert a descriptive caption for the image -- text: Make the photo lighter -- text: Move the logo to the top layer -- text: Make the shape smaller -- text: Flip the icon vertically -metrics: -- accuracy -pipeline_tag: text-classification -library_name: setfit -inference: true -base_model: nomic-ai/nomic-embed-text-v1.5 +- feature-extraction +- sentence-similarity +- mteb +- transformers +- transformers.js model-index: -- name: SetFit with nomic-ai/nomic-embed-text-v1.5 +- name: epoch_0_model results: - task: - type: text-classification - name: Text Classification + type: Classification dataset: - name: Unknown - type: unknown + type: mteb/amazon_counterfactual + name: MTEB AmazonCounterfactualClassification (en) + config: en split: test + revision: e8379541af4e31359cca9fbcf4b00f2671dba205 metrics: - type: accuracy - value: 0.5701459034792368 - name: Accuracy + value: 75.20895522388058 + - type: ap + value: 38.57605549557802 + - type: f1 + value: 69.35586565857854 + - task: + type: Classification + dataset: + type: mteb/amazon_polarity + name: MTEB AmazonPolarityClassification + config: default + split: test + revision: e2d317d38cd51312af73b3d32a06d1a08b442046 + metrics: + - type: accuracy + value: 91.8144 + - type: ap + value: 88.65222882032363 + - type: f1 + value: 91.80426301643274 + - task: + type: Classification + dataset: + type: mteb/amazon_reviews_multi + name: MTEB AmazonReviewsClassification (en) + config: en + split: test + revision: 1399c76144fd37290681b995c656ef9b2e06e26d + metrics: + - type: accuracy + value: 47.162000000000006 + - type: f1 + value: 46.59329642263158 + - task: + type: Retrieval + dataset: + type: arguana + name: MTEB ArguAna + config: default + split: 
test + revision: None + metrics: + - type: map_at_1 + value: 24.253 + - type: map_at_10 + value: 38.962 + - type: map_at_100 + value: 40.081 + - type: map_at_1000 + value: 40.089000000000006 + - type: map_at_3 + value: 33.499 + - type: map_at_5 + value: 36.351 + - type: mrr_at_1 + value: 24.609 + - type: mrr_at_10 + value: 39.099000000000004 + - type: mrr_at_100 + value: 40.211000000000006 + - type: mrr_at_1000 + value: 40.219 + - type: mrr_at_3 + value: 33.677 + - type: mrr_at_5 + value: 36.469 + - type: ndcg_at_1 + value: 24.253 + - type: ndcg_at_10 + value: 48.010999999999996 + - type: ndcg_at_100 + value: 52.756 + - type: ndcg_at_1000 + value: 52.964999999999996 + - type: ndcg_at_3 + value: 36.564 + - type: ndcg_at_5 + value: 41.711999999999996 + - type: precision_at_1 + value: 24.253 + - type: precision_at_10 + value: 7.738 + - type: precision_at_100 + value: 0.98 + - type: precision_at_1000 + value: 0.1 + - type: precision_at_3 + value: 15.149000000000001 + - type: precision_at_5 + value: 11.593 + - type: recall_at_1 + value: 24.253 + - type: recall_at_10 + value: 77.383 + - type: recall_at_100 + value: 98.009 + - type: recall_at_1000 + value: 99.644 + - type: recall_at_3 + value: 45.448 + - type: recall_at_5 + value: 57.965999999999994 + - task: + type: Clustering + dataset: + type: mteb/arxiv-clustering-p2p + name: MTEB ArxivClusteringP2P + config: default + split: test + revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d + metrics: + - type: v_measure + value: 45.69069567851087 + - task: + type: Clustering + dataset: + type: mteb/arxiv-clustering-s2s + name: MTEB ArxivClusteringS2S + config: default + split: test + revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53 + metrics: + - type: v_measure + value: 36.35185490976283 + - task: + type: Reranking + dataset: + type: mteb/askubuntudupquestions-reranking + name: MTEB AskUbuntuDupQuestions + config: default + split: test + revision: 2000358ca161889fa9c082cb41daa8dcfb161a54 + metrics: + - type: map + value: 
61.71274951450321 + - type: mrr + value: 76.06032625423207 + - task: + type: STS + dataset: + type: mteb/biosses-sts + name: MTEB BIOSSES + config: default + split: test + revision: d3fb88f8f02e40887cd149695127462bbcf29b4a + metrics: + - type: cos_sim_pearson + value: 86.73980520022269 + - type: cos_sim_spearman + value: 84.24649792685918 + - type: euclidean_pearson + value: 85.85197641158186 + - type: euclidean_spearman + value: 84.24649792685918 + - type: manhattan_pearson + value: 86.26809552711346 + - type: manhattan_spearman + value: 84.56397504030865 + - task: + type: Classification + dataset: + type: mteb/banking77 + name: MTEB Banking77Classification + config: default + split: test + revision: 0fd18e25b25c072e09e0d92ab615fda904d66300 + metrics: + - type: accuracy + value: 84.25324675324674 + - type: f1 + value: 84.17872280892557 + - task: + type: Clustering + dataset: + type: mteb/biorxiv-clustering-p2p + name: MTEB BiorxivClusteringP2P + config: default + split: test + revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40 + metrics: + - type: v_measure + value: 38.770253446400886 + - task: + type: Clustering + dataset: + type: mteb/biorxiv-clustering-s2s + name: MTEB BiorxivClusteringS2S + config: default + split: test + revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908 + metrics: + - type: v_measure + value: 32.94307095497281 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackAndroidRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 32.164 + - type: map_at_10 + value: 42.641 + - type: map_at_100 + value: 43.947 + - type: map_at_1000 + value: 44.074999999999996 + - type: map_at_3 + value: 39.592 + - type: map_at_5 + value: 41.204 + - type: mrr_at_1 + value: 39.628 + - type: mrr_at_10 + value: 48.625 + - type: mrr_at_100 + value: 49.368 + - type: mrr_at_1000 + value: 49.413000000000004 + - type: mrr_at_3 + value: 46.400000000000006 + - type: mrr_at_5 + value: 47.68 + - type: 
ndcg_at_1 + value: 39.628 + - type: ndcg_at_10 + value: 48.564 + - type: ndcg_at_100 + value: 53.507000000000005 + - type: ndcg_at_1000 + value: 55.635999999999996 + - type: ndcg_at_3 + value: 44.471 + - type: ndcg_at_5 + value: 46.137 + - type: precision_at_1 + value: 39.628 + - type: precision_at_10 + value: 8.856 + - type: precision_at_100 + value: 1.429 + - type: precision_at_1000 + value: 0.191 + - type: precision_at_3 + value: 21.268 + - type: precision_at_5 + value: 14.649000000000001 + - type: recall_at_1 + value: 32.164 + - type: recall_at_10 + value: 59.609 + - type: recall_at_100 + value: 80.521 + - type: recall_at_1000 + value: 94.245 + - type: recall_at_3 + value: 46.521 + - type: recall_at_5 + value: 52.083999999999996 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackEnglishRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 31.526 + - type: map_at_10 + value: 41.581 + - type: map_at_100 + value: 42.815999999999995 + - type: map_at_1000 + value: 42.936 + - type: map_at_3 + value: 38.605000000000004 + - type: map_at_5 + value: 40.351 + - type: mrr_at_1 + value: 39.489999999999995 + - type: mrr_at_10 + value: 47.829 + - type: mrr_at_100 + value: 48.512 + - type: mrr_at_1000 + value: 48.552 + - type: mrr_at_3 + value: 45.754 + - type: mrr_at_5 + value: 46.986 + - type: ndcg_at_1 + value: 39.489999999999995 + - type: ndcg_at_10 + value: 47.269 + - type: ndcg_at_100 + value: 51.564 + - type: ndcg_at_1000 + value: 53.53099999999999 + - type: ndcg_at_3 + value: 43.301 + - type: ndcg_at_5 + value: 45.239000000000004 + - type: precision_at_1 + value: 39.489999999999995 + - type: precision_at_10 + value: 8.93 + - type: precision_at_100 + value: 1.415 + - type: precision_at_1000 + value: 0.188 + - type: precision_at_3 + value: 20.892 + - type: precision_at_5 + value: 14.865999999999998 + - type: recall_at_1 + value: 31.526 + - type: recall_at_10 + value: 56.76 + - type: 
recall_at_100 + value: 75.029 + - type: recall_at_1000 + value: 87.491 + - type: recall_at_3 + value: 44.786 + - type: recall_at_5 + value: 50.254 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackGamingRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 40.987 + - type: map_at_10 + value: 52.827 + - type: map_at_100 + value: 53.751000000000005 + - type: map_at_1000 + value: 53.81 + - type: map_at_3 + value: 49.844 + - type: map_at_5 + value: 51.473 + - type: mrr_at_1 + value: 46.833999999999996 + - type: mrr_at_10 + value: 56.389 + - type: mrr_at_100 + value: 57.003 + - type: mrr_at_1000 + value: 57.034 + - type: mrr_at_3 + value: 54.17999999999999 + - type: mrr_at_5 + value: 55.486999999999995 + - type: ndcg_at_1 + value: 46.833999999999996 + - type: ndcg_at_10 + value: 58.372 + - type: ndcg_at_100 + value: 62.068 + - type: ndcg_at_1000 + value: 63.288 + - type: ndcg_at_3 + value: 53.400000000000006 + - type: ndcg_at_5 + value: 55.766000000000005 + - type: precision_at_1 + value: 46.833999999999996 + - type: precision_at_10 + value: 9.191 + - type: precision_at_100 + value: 1.192 + - type: precision_at_1000 + value: 0.134 + - type: precision_at_3 + value: 23.448 + - type: precision_at_5 + value: 15.862000000000002 + - type: recall_at_1 + value: 40.987 + - type: recall_at_10 + value: 71.146 + - type: recall_at_100 + value: 87.035 + - type: recall_at_1000 + value: 95.633 + - type: recall_at_3 + value: 58.025999999999996 + - type: recall_at_5 + value: 63.815999999999995 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackGisRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 24.587 + - type: map_at_10 + value: 33.114 + - type: map_at_100 + value: 34.043 + - type: map_at_1000 + value: 34.123999999999995 + - type: map_at_3 + value: 30.45 + - type: map_at_5 + value: 31.813999999999997 + - type: mrr_at_1 + 
value: 26.554 + - type: mrr_at_10 + value: 35.148 + - type: mrr_at_100 + value: 35.926 + - type: mrr_at_1000 + value: 35.991 + - type: mrr_at_3 + value: 32.599000000000004 + - type: mrr_at_5 + value: 33.893 + - type: ndcg_at_1 + value: 26.554 + - type: ndcg_at_10 + value: 38.132 + - type: ndcg_at_100 + value: 42.78 + - type: ndcg_at_1000 + value: 44.919 + - type: ndcg_at_3 + value: 32.833 + - type: ndcg_at_5 + value: 35.168 + - type: precision_at_1 + value: 26.554 + - type: precision_at_10 + value: 5.921 + - type: precision_at_100 + value: 0.8659999999999999 + - type: precision_at_1000 + value: 0.109 + - type: precision_at_3 + value: 13.861 + - type: precision_at_5 + value: 9.605 + - type: recall_at_1 + value: 24.587 + - type: recall_at_10 + value: 51.690000000000005 + - type: recall_at_100 + value: 73.428 + - type: recall_at_1000 + value: 89.551 + - type: recall_at_3 + value: 37.336999999999996 + - type: recall_at_5 + value: 43.047000000000004 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackMathematicaRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 16.715 + - type: map_at_10 + value: 24.251 + - type: map_at_100 + value: 25.326999999999998 + - type: map_at_1000 + value: 25.455 + - type: map_at_3 + value: 21.912000000000003 + - type: map_at_5 + value: 23.257 + - type: mrr_at_1 + value: 20.274 + - type: mrr_at_10 + value: 28.552 + - type: mrr_at_100 + value: 29.42 + - type: mrr_at_1000 + value: 29.497 + - type: mrr_at_3 + value: 26.14 + - type: mrr_at_5 + value: 27.502 + - type: ndcg_at_1 + value: 20.274 + - type: ndcg_at_10 + value: 29.088 + - type: ndcg_at_100 + value: 34.293 + - type: ndcg_at_1000 + value: 37.271 + - type: ndcg_at_3 + value: 24.708 + - type: ndcg_at_5 + value: 26.809 + - type: precision_at_1 + value: 20.274 + - type: precision_at_10 + value: 5.361 + - type: precision_at_100 + value: 0.915 + - type: precision_at_1000 + value: 0.13 + - type: precision_at_3 + 
value: 11.733 + - type: precision_at_5 + value: 8.556999999999999 + - type: recall_at_1 + value: 16.715 + - type: recall_at_10 + value: 39.587 + - type: recall_at_100 + value: 62.336000000000006 + - type: recall_at_1000 + value: 83.453 + - type: recall_at_3 + value: 27.839999999999996 + - type: recall_at_5 + value: 32.952999999999996 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackPhysicsRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 28.793000000000003 + - type: map_at_10 + value: 38.582 + - type: map_at_100 + value: 39.881 + - type: map_at_1000 + value: 39.987 + - type: map_at_3 + value: 35.851 + - type: map_at_5 + value: 37.289 + - type: mrr_at_1 + value: 34.455999999999996 + - type: mrr_at_10 + value: 43.909 + - type: mrr_at_100 + value: 44.74 + - type: mrr_at_1000 + value: 44.786 + - type: mrr_at_3 + value: 41.659 + - type: mrr_at_5 + value: 43.010999999999996 + - type: ndcg_at_1 + value: 34.455999999999996 + - type: ndcg_at_10 + value: 44.266 + - type: ndcg_at_100 + value: 49.639 + - type: ndcg_at_1000 + value: 51.644 + - type: ndcg_at_3 + value: 39.865 + - type: ndcg_at_5 + value: 41.887 + - type: precision_at_1 + value: 34.455999999999996 + - type: precision_at_10 + value: 7.843999999999999 + - type: precision_at_100 + value: 1.243 + - type: precision_at_1000 + value: 0.158 + - type: precision_at_3 + value: 18.831999999999997 + - type: precision_at_5 + value: 13.147 + - type: recall_at_1 + value: 28.793000000000003 + - type: recall_at_10 + value: 55.68300000000001 + - type: recall_at_100 + value: 77.99000000000001 + - type: recall_at_1000 + value: 91.183 + - type: recall_at_3 + value: 43.293 + - type: recall_at_5 + value: 48.618 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackProgrammersRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 25.907000000000004 + - type: map_at_10 + value: 
35.519 + - type: map_at_100 + value: 36.806 + - type: map_at_1000 + value: 36.912 + - type: map_at_3 + value: 32.748 + - type: map_at_5 + value: 34.232 + - type: mrr_at_1 + value: 31.621 + - type: mrr_at_10 + value: 40.687 + - type: mrr_at_100 + value: 41.583 + - type: mrr_at_1000 + value: 41.638999999999996 + - type: mrr_at_3 + value: 38.527 + - type: mrr_at_5 + value: 39.612 + - type: ndcg_at_1 + value: 31.621 + - type: ndcg_at_10 + value: 41.003 + - type: ndcg_at_100 + value: 46.617999999999995 + - type: ndcg_at_1000 + value: 48.82 + - type: ndcg_at_3 + value: 36.542 + - type: ndcg_at_5 + value: 38.368 + - type: precision_at_1 + value: 31.621 + - type: precision_at_10 + value: 7.396999999999999 + - type: precision_at_100 + value: 1.191 + - type: precision_at_1000 + value: 0.153 + - type: precision_at_3 + value: 17.39 + - type: precision_at_5 + value: 12.1 + - type: recall_at_1 + value: 25.907000000000004 + - type: recall_at_10 + value: 52.115 + - type: recall_at_100 + value: 76.238 + - type: recall_at_1000 + value: 91.218 + - type: recall_at_3 + value: 39.417 + - type: recall_at_5 + value: 44.435 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 25.732166666666668 + - type: map_at_10 + value: 34.51616666666667 + - type: map_at_100 + value: 35.67241666666666 + - type: map_at_1000 + value: 35.78675 + - type: map_at_3 + value: 31.953416666666662 + - type: map_at_5 + value: 33.333 + - type: mrr_at_1 + value: 30.300166666666673 + - type: mrr_at_10 + value: 38.6255 + - type: mrr_at_100 + value: 39.46183333333334 + - type: mrr_at_1000 + value: 39.519999999999996 + - type: mrr_at_3 + value: 36.41299999999999 + - type: mrr_at_5 + value: 37.6365 + - type: ndcg_at_1 + value: 30.300166666666673 + - type: ndcg_at_10 + value: 39.61466666666667 + - type: ndcg_at_100 + value: 44.60808333333334 + - type: ndcg_at_1000 + value: 46.91708333333334 + 
- type: ndcg_at_3 + value: 35.26558333333333 + - type: ndcg_at_5 + value: 37.220000000000006 + - type: precision_at_1 + value: 30.300166666666673 + - type: precision_at_10 + value: 6.837416666666667 + - type: precision_at_100 + value: 1.10425 + - type: precision_at_1000 + value: 0.14875 + - type: precision_at_3 + value: 16.13716666666667 + - type: precision_at_5 + value: 11.2815 + - type: recall_at_1 + value: 25.732166666666668 + - type: recall_at_10 + value: 50.578916666666665 + - type: recall_at_100 + value: 72.42183333333334 + - type: recall_at_1000 + value: 88.48766666666667 + - type: recall_at_3 + value: 38.41325 + - type: recall_at_5 + value: 43.515750000000004 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackStatsRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 23.951 + - type: map_at_10 + value: 30.974 + - type: map_at_100 + value: 31.804 + - type: map_at_1000 + value: 31.900000000000002 + - type: map_at_3 + value: 28.762 + - type: map_at_5 + value: 29.94 + - type: mrr_at_1 + value: 26.534000000000002 + - type: mrr_at_10 + value: 33.553 + - type: mrr_at_100 + value: 34.297 + - type: mrr_at_1000 + value: 34.36 + - type: mrr_at_3 + value: 31.391000000000002 + - type: mrr_at_5 + value: 32.525999999999996 + - type: ndcg_at_1 + value: 26.534000000000002 + - type: ndcg_at_10 + value: 35.112 + - type: ndcg_at_100 + value: 39.28 + - type: ndcg_at_1000 + value: 41.723 + - type: ndcg_at_3 + value: 30.902 + - type: ndcg_at_5 + value: 32.759 + - type: precision_at_1 + value: 26.534000000000002 + - type: precision_at_10 + value: 5.445 + - type: precision_at_100 + value: 0.819 + - type: precision_at_1000 + value: 0.11 + - type: precision_at_3 + value: 12.986 + - type: precision_at_5 + value: 9.049 + - type: recall_at_1 + value: 23.951 + - type: recall_at_10 + value: 45.24 + - type: recall_at_100 + value: 64.12299999999999 + - type: recall_at_1000 + value: 82.28999999999999 + - type: 
recall_at_3 + value: 33.806000000000004 + - type: recall_at_5 + value: 38.277 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackTexRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 16.829 + - type: map_at_10 + value: 23.684 + - type: map_at_100 + value: 24.683 + - type: map_at_1000 + value: 24.81 + - type: map_at_3 + value: 21.554000000000002 + - type: map_at_5 + value: 22.768 + - type: mrr_at_1 + value: 20.096 + - type: mrr_at_10 + value: 27.230999999999998 + - type: mrr_at_100 + value: 28.083999999999996 + - type: mrr_at_1000 + value: 28.166000000000004 + - type: mrr_at_3 + value: 25.212 + - type: mrr_at_5 + value: 26.32 + - type: ndcg_at_1 + value: 20.096 + - type: ndcg_at_10 + value: 27.989000000000004 + - type: ndcg_at_100 + value: 32.847 + - type: ndcg_at_1000 + value: 35.896 + - type: ndcg_at_3 + value: 24.116 + - type: ndcg_at_5 + value: 25.964 + - type: precision_at_1 + value: 20.096 + - type: precision_at_10 + value: 5 + - type: precision_at_100 + value: 0.8750000000000001 + - type: precision_at_1000 + value: 0.131 + - type: precision_at_3 + value: 11.207 + - type: precision_at_5 + value: 8.08 + - type: recall_at_1 + value: 16.829 + - type: recall_at_10 + value: 37.407000000000004 + - type: recall_at_100 + value: 59.101000000000006 + - type: recall_at_1000 + value: 81.024 + - type: recall_at_3 + value: 26.739 + - type: recall_at_5 + value: 31.524 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackUnixRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 24.138 + - type: map_at_10 + value: 32.275999999999996 + - type: map_at_100 + value: 33.416000000000004 + - type: map_at_1000 + value: 33.527 + - type: map_at_3 + value: 29.854000000000003 + - type: map_at_5 + value: 31.096 + - type: mrr_at_1 + value: 28.450999999999997 + - type: mrr_at_10 + value: 36.214 + - type: mrr_at_100 + value: 37.134 
+ - type: mrr_at_1000 + value: 37.198 + - type: mrr_at_3 + value: 34.001999999999995 + - type: mrr_at_5 + value: 35.187000000000005 + - type: ndcg_at_1 + value: 28.450999999999997 + - type: ndcg_at_10 + value: 37.166 + - type: ndcg_at_100 + value: 42.454 + - type: ndcg_at_1000 + value: 44.976 + - type: ndcg_at_3 + value: 32.796 + - type: ndcg_at_5 + value: 34.631 + - type: precision_at_1 + value: 28.450999999999997 + - type: precision_at_10 + value: 6.241 + - type: precision_at_100 + value: 0.9950000000000001 + - type: precision_at_1000 + value: 0.133 + - type: precision_at_3 + value: 14.801 + - type: precision_at_5 + value: 10.280000000000001 + - type: recall_at_1 + value: 24.138 + - type: recall_at_10 + value: 48.111 + - type: recall_at_100 + value: 71.245 + - type: recall_at_1000 + value: 88.986 + - type: recall_at_3 + value: 36.119 + - type: recall_at_5 + value: 40.846 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackWebmastersRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 23.244 + - type: map_at_10 + value: 31.227 + - type: map_at_100 + value: 33.007 + - type: map_at_1000 + value: 33.223 + - type: map_at_3 + value: 28.924 + - type: map_at_5 + value: 30.017 + - type: mrr_at_1 + value: 27.668 + - type: mrr_at_10 + value: 35.524 + - type: mrr_at_100 + value: 36.699 + - type: mrr_at_1000 + value: 36.759 + - type: mrr_at_3 + value: 33.366 + - type: mrr_at_5 + value: 34.552 + - type: ndcg_at_1 + value: 27.668 + - type: ndcg_at_10 + value: 36.381 + - type: ndcg_at_100 + value: 43.062 + - type: ndcg_at_1000 + value: 45.656 + - type: ndcg_at_3 + value: 32.501999999999995 + - type: ndcg_at_5 + value: 34.105999999999995 + - type: precision_at_1 + value: 27.668 + - type: precision_at_10 + value: 6.798 + - type: precision_at_100 + value: 1.492 + - type: precision_at_1000 + value: 0.234 + - type: precision_at_3 + value: 15.152 + - type: precision_at_5 + value: 10.791 + - type: 
recall_at_1 + value: 23.244 + - type: recall_at_10 + value: 45.979 + - type: recall_at_100 + value: 74.822 + - type: recall_at_1000 + value: 91.078 + - type: recall_at_3 + value: 34.925 + - type: recall_at_5 + value: 39.126 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackWordpressRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 19.945 + - type: map_at_10 + value: 27.517999999999997 + - type: map_at_100 + value: 28.588 + - type: map_at_1000 + value: 28.682000000000002 + - type: map_at_3 + value: 25.345000000000002 + - type: map_at_5 + value: 26.555 + - type: mrr_at_1 + value: 21.996 + - type: mrr_at_10 + value: 29.845 + - type: mrr_at_100 + value: 30.775999999999996 + - type: mrr_at_1000 + value: 30.845 + - type: mrr_at_3 + value: 27.726 + - type: mrr_at_5 + value: 28.882 + - type: ndcg_at_1 + value: 21.996 + - type: ndcg_at_10 + value: 32.034 + - type: ndcg_at_100 + value: 37.185 + - type: ndcg_at_1000 + value: 39.645 + - type: ndcg_at_3 + value: 27.750999999999998 + - type: ndcg_at_5 + value: 29.805999999999997 + - type: precision_at_1 + value: 21.996 + - type: precision_at_10 + value: 5.065 + - type: precision_at_100 + value: 0.819 + - type: precision_at_1000 + value: 0.11399999999999999 + - type: precision_at_3 + value: 12.076 + - type: precision_at_5 + value: 8.392 + - type: recall_at_1 + value: 19.945 + - type: recall_at_10 + value: 43.62 + - type: recall_at_100 + value: 67.194 + - type: recall_at_1000 + value: 85.7 + - type: recall_at_3 + value: 32.15 + - type: recall_at_5 + value: 37.208999999999996 + - task: + type: Retrieval + dataset: + type: climate-fever + name: MTEB ClimateFEVER + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 18.279 + - type: map_at_10 + value: 31.052999999999997 + - type: map_at_100 + value: 33.125 + - type: map_at_1000 + value: 33.306000000000004 + - type: map_at_3 + value: 26.208 + - type: map_at_5 + 
value: 28.857 + - type: mrr_at_1 + value: 42.671 + - type: mrr_at_10 + value: 54.557 + - type: mrr_at_100 + value: 55.142 + - type: mrr_at_1000 + value: 55.169000000000004 + - type: mrr_at_3 + value: 51.488 + - type: mrr_at_5 + value: 53.439 + - type: ndcg_at_1 + value: 42.671 + - type: ndcg_at_10 + value: 41.276 + - type: ndcg_at_100 + value: 48.376000000000005 + - type: ndcg_at_1000 + value: 51.318 + - type: ndcg_at_3 + value: 35.068 + - type: ndcg_at_5 + value: 37.242 + - type: precision_at_1 + value: 42.671 + - type: precision_at_10 + value: 12.638 + - type: precision_at_100 + value: 2.045 + - type: precision_at_1000 + value: 0.26 + - type: precision_at_3 + value: 26.08 + - type: precision_at_5 + value: 19.805 + - type: recall_at_1 + value: 18.279 + - type: recall_at_10 + value: 46.946 + - type: recall_at_100 + value: 70.97200000000001 + - type: recall_at_1000 + value: 87.107 + - type: recall_at_3 + value: 31.147999999999996 + - type: recall_at_5 + value: 38.099 + - task: + type: Retrieval + dataset: + type: dbpedia-entity + name: MTEB DBPedia + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 8.573 + - type: map_at_10 + value: 19.747 + - type: map_at_100 + value: 28.205000000000002 + - type: map_at_1000 + value: 29.831000000000003 + - type: map_at_3 + value: 14.109 + - type: map_at_5 + value: 16.448999999999998 + - type: mrr_at_1 + value: 71 + - type: mrr_at_10 + value: 77.68599999999999 + - type: mrr_at_100 + value: 77.995 + - type: mrr_at_1000 + value: 78.00200000000001 + - type: mrr_at_3 + value: 76.292 + - type: mrr_at_5 + value: 77.029 + - type: ndcg_at_1 + value: 59.12500000000001 + - type: ndcg_at_10 + value: 43.9 + - type: ndcg_at_100 + value: 47.863 + - type: ndcg_at_1000 + value: 54.848 + - type: ndcg_at_3 + value: 49.803999999999995 + - type: ndcg_at_5 + value: 46.317 + - type: precision_at_1 + value: 71 + - type: precision_at_10 + value: 34.4 + - type: precision_at_100 + value: 11.063 + - type: precision_at_1000 
+ value: 1.989 + - type: precision_at_3 + value: 52.333 + - type: precision_at_5 + value: 43.7 + - type: recall_at_1 + value: 8.573 + - type: recall_at_10 + value: 25.615 + - type: recall_at_100 + value: 53.385000000000005 + - type: recall_at_1000 + value: 75.46000000000001 + - type: recall_at_3 + value: 15.429 + - type: recall_at_5 + value: 19.357 + - task: + type: Classification + dataset: + type: mteb/emotion + name: MTEB EmotionClassification + config: default + split: test + revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37 + metrics: + - type: accuracy + value: 47.989999999999995 + - type: f1 + value: 42.776314451497555 + - task: + type: Retrieval + dataset: + type: fever + name: MTEB FEVER + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 74.13499999999999 + - type: map_at_10 + value: 82.825 + - type: map_at_100 + value: 83.096 + - type: map_at_1000 + value: 83.111 + - type: map_at_3 + value: 81.748 + - type: map_at_5 + value: 82.446 + - type: mrr_at_1 + value: 79.553 + - type: mrr_at_10 + value: 86.654 + - type: mrr_at_100 + value: 86.774 + - type: mrr_at_1000 + value: 86.778 + - type: mrr_at_3 + value: 85.981 + - type: mrr_at_5 + value: 86.462 + - type: ndcg_at_1 + value: 79.553 + - type: ndcg_at_10 + value: 86.345 + - type: ndcg_at_100 + value: 87.32 + - type: ndcg_at_1000 + value: 87.58200000000001 + - type: ndcg_at_3 + value: 84.719 + - type: ndcg_at_5 + value: 85.677 + - type: precision_at_1 + value: 79.553 + - type: precision_at_10 + value: 10.402000000000001 + - type: precision_at_100 + value: 1.1119999999999999 + - type: precision_at_1000 + value: 0.11499999999999999 + - type: precision_at_3 + value: 32.413 + - type: precision_at_5 + value: 20.138 + - type: recall_at_1 + value: 74.13499999999999 + - type: recall_at_10 + value: 93.215 + - type: recall_at_100 + value: 97.083 + - type: recall_at_1000 + value: 98.732 + - type: recall_at_3 + value: 88.79 + - type: recall_at_5 + value: 91.259 + - task: + type: 
Retrieval + dataset: + type: fiqa + name: MTEB FiQA2018 + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 18.298000000000002 + - type: map_at_10 + value: 29.901 + - type: map_at_100 + value: 31.528 + - type: map_at_1000 + value: 31.713 + - type: map_at_3 + value: 25.740000000000002 + - type: map_at_5 + value: 28.227999999999998 + - type: mrr_at_1 + value: 36.728 + - type: mrr_at_10 + value: 45.401 + - type: mrr_at_100 + value: 46.27 + - type: mrr_at_1000 + value: 46.315 + - type: mrr_at_3 + value: 42.978 + - type: mrr_at_5 + value: 44.29 + - type: ndcg_at_1 + value: 36.728 + - type: ndcg_at_10 + value: 37.456 + - type: ndcg_at_100 + value: 43.832 + - type: ndcg_at_1000 + value: 47 + - type: ndcg_at_3 + value: 33.694 + - type: ndcg_at_5 + value: 35.085 + - type: precision_at_1 + value: 36.728 + - type: precision_at_10 + value: 10.386 + - type: precision_at_100 + value: 1.701 + - type: precision_at_1000 + value: 0.22599999999999998 + - type: precision_at_3 + value: 22.479 + - type: precision_at_5 + value: 16.605 + - type: recall_at_1 + value: 18.298000000000002 + - type: recall_at_10 + value: 44.369 + - type: recall_at_100 + value: 68.098 + - type: recall_at_1000 + value: 87.21900000000001 + - type: recall_at_3 + value: 30.215999999999998 + - type: recall_at_5 + value: 36.861 + - task: + type: Retrieval + dataset: + type: hotpotqa + name: MTEB HotpotQA + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 39.568 + - type: map_at_10 + value: 65.061 + - type: map_at_100 + value: 65.896 + - type: map_at_1000 + value: 65.95100000000001 + - type: map_at_3 + value: 61.831 + - type: map_at_5 + value: 63.849000000000004 + - type: mrr_at_1 + value: 79.136 + - type: mrr_at_10 + value: 84.58200000000001 + - type: mrr_at_100 + value: 84.765 + - type: mrr_at_1000 + value: 84.772 + - type: mrr_at_3 + value: 83.684 + - type: mrr_at_5 + value: 84.223 + - type: ndcg_at_1 + value: 79.136 + - type: ndcg_at_10 + 
value: 72.622 + - type: ndcg_at_100 + value: 75.539 + - type: ndcg_at_1000 + value: 76.613 + - type: ndcg_at_3 + value: 68.065 + - type: ndcg_at_5 + value: 70.58 + - type: precision_at_1 + value: 79.136 + - type: precision_at_10 + value: 15.215 + - type: precision_at_100 + value: 1.7500000000000002 + - type: precision_at_1000 + value: 0.189 + - type: precision_at_3 + value: 44.011 + - type: precision_at_5 + value: 28.388999999999996 + - type: recall_at_1 + value: 39.568 + - type: recall_at_10 + value: 76.077 + - type: recall_at_100 + value: 87.481 + - type: recall_at_1000 + value: 94.56400000000001 + - type: recall_at_3 + value: 66.01599999999999 + - type: recall_at_5 + value: 70.97200000000001 + - task: + type: Classification + dataset: + type: mteb/imdb + name: MTEB ImdbClassification + config: default + split: test + revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7 + metrics: + - type: accuracy + value: 85.312 + - type: ap + value: 80.36296867333715 + - type: f1 + value: 85.26613311552218 + - task: + type: Retrieval + dataset: + type: msmarco + name: MTEB MSMARCO + config: default + split: dev + revision: None + metrics: + - type: map_at_1 + value: 23.363999999999997 + - type: map_at_10 + value: 35.711999999999996 + - type: map_at_100 + value: 36.876999999999995 + - type: map_at_1000 + value: 36.923 + - type: map_at_3 + value: 32.034 + - type: map_at_5 + value: 34.159 + - type: mrr_at_1 + value: 24.04 + - type: mrr_at_10 + value: 36.345 + - type: mrr_at_100 + value: 37.441 + - type: mrr_at_1000 + value: 37.480000000000004 + - type: mrr_at_3 + value: 32.713 + - type: mrr_at_5 + value: 34.824 + - type: ndcg_at_1 + value: 24.026 + - type: ndcg_at_10 + value: 42.531 + - type: ndcg_at_100 + value: 48.081 + - type: ndcg_at_1000 + value: 49.213 + - type: ndcg_at_3 + value: 35.044 + - type: ndcg_at_5 + value: 38.834 + - type: precision_at_1 + value: 24.026 + - type: precision_at_10 + value: 6.622999999999999 + - type: precision_at_100 + value: 0.941 + - type: 
precision_at_1000 + value: 0.104 + - type: precision_at_3 + value: 14.909 + - type: precision_at_5 + value: 10.871 + - type: recall_at_1 + value: 23.363999999999997 + - type: recall_at_10 + value: 63.426 + - type: recall_at_100 + value: 88.96300000000001 + - type: recall_at_1000 + value: 97.637 + - type: recall_at_3 + value: 43.095 + - type: recall_at_5 + value: 52.178000000000004 + - task: + type: Classification + dataset: + type: mteb/mtop_domain + name: MTEB MTOPDomainClassification (en) + config: en + split: test + revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf + metrics: + - type: accuracy + value: 93.0095759233926 + - type: f1 + value: 92.78387794667408 + - task: + type: Classification + dataset: + type: mteb/mtop_intent + name: MTEB MTOPIntentClassification (en) + config: en + split: test + revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba + metrics: + - type: accuracy + value: 75.0296397628819 + - type: f1 + value: 58.45699589820874 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (en) + config: en + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 73.45662407531944 + - type: f1 + value: 71.42364781421813 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (en) + config: en + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.07800941492937 + - type: f1 + value: 77.22799045640845 + - task: + type: Clustering + dataset: + type: mteb/medrxiv-clustering-p2p + name: MTEB MedrxivClusteringP2P + config: default + split: test + revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73 + metrics: + - type: v_measure + value: 34.531234379250606 + - task: + type: Clustering + dataset: + type: mteb/medrxiv-clustering-s2s + name: MTEB MedrxivClusteringS2S + config: default + split: test + revision: 
35191c8c0dca72d8ff3efcd72aa802307d469663 + metrics: + - type: v_measure + value: 30.941490381193802 + - task: + type: Reranking + dataset: + type: mteb/mind_small + name: MTEB MindSmallReranking + config: default + split: test + revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69 + metrics: + - type: map + value: 30.3115090856725 + - type: mrr + value: 31.290667638675757 + - task: + type: Retrieval + dataset: + type: nfcorpus + name: MTEB NFCorpus + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 5.465 + - type: map_at_10 + value: 13.03 + - type: map_at_100 + value: 16.057 + - type: map_at_1000 + value: 17.49 + - type: map_at_3 + value: 9.553 + - type: map_at_5 + value: 11.204 + - type: mrr_at_1 + value: 43.653 + - type: mrr_at_10 + value: 53.269 + - type: mrr_at_100 + value: 53.72 + - type: mrr_at_1000 + value: 53.761 + - type: mrr_at_3 + value: 50.929 + - type: mrr_at_5 + value: 52.461 + - type: ndcg_at_1 + value: 42.26 + - type: ndcg_at_10 + value: 34.673 + - type: ndcg_at_100 + value: 30.759999999999998 + - type: ndcg_at_1000 + value: 39.728 + - type: ndcg_at_3 + value: 40.349000000000004 + - type: ndcg_at_5 + value: 37.915 + - type: precision_at_1 + value: 43.653 + - type: precision_at_10 + value: 25.789 + - type: precision_at_100 + value: 7.754999999999999 + - type: precision_at_1000 + value: 2.07 + - type: precision_at_3 + value: 38.596000000000004 + - type: precision_at_5 + value: 33.251 + - type: recall_at_1 + value: 5.465 + - type: recall_at_10 + value: 17.148 + - type: recall_at_100 + value: 29.768 + - type: recall_at_1000 + value: 62.239 + - type: recall_at_3 + value: 10.577 + - type: recall_at_5 + value: 13.315 + - task: + type: Retrieval + dataset: + type: nq + name: MTEB NQ + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 37.008 + - type: map_at_10 + value: 52.467 + - type: map_at_100 + value: 53.342999999999996 + - type: map_at_1000 + value: 53.366 + - type: map_at_3 + 
value: 48.412 + - type: map_at_5 + value: 50.875 + - type: mrr_at_1 + value: 41.541 + - type: mrr_at_10 + value: 54.967 + - type: mrr_at_100 + value: 55.611 + - type: mrr_at_1000 + value: 55.627 + - type: mrr_at_3 + value: 51.824999999999996 + - type: mrr_at_5 + value: 53.763000000000005 + - type: ndcg_at_1 + value: 41.541 + - type: ndcg_at_10 + value: 59.724999999999994 + - type: ndcg_at_100 + value: 63.38700000000001 + - type: ndcg_at_1000 + value: 63.883 + - type: ndcg_at_3 + value: 52.331 + - type: ndcg_at_5 + value: 56.327000000000005 + - type: precision_at_1 + value: 41.541 + - type: precision_at_10 + value: 9.447 + - type: precision_at_100 + value: 1.1520000000000001 + - type: precision_at_1000 + value: 0.12 + - type: precision_at_3 + value: 23.262 + - type: precision_at_5 + value: 16.314999999999998 + - type: recall_at_1 + value: 37.008 + - type: recall_at_10 + value: 79.145 + - type: recall_at_100 + value: 94.986 + - type: recall_at_1000 + value: 98.607 + - type: recall_at_3 + value: 60.277 + - type: recall_at_5 + value: 69.407 + - task: + type: Retrieval + dataset: + type: quora + name: MTEB QuoraRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 70.402 + - type: map_at_10 + value: 84.181 + - type: map_at_100 + value: 84.796 + - type: map_at_1000 + value: 84.81400000000001 + - type: map_at_3 + value: 81.209 + - type: map_at_5 + value: 83.085 + - type: mrr_at_1 + value: 81.02000000000001 + - type: mrr_at_10 + value: 87.263 + - type: mrr_at_100 + value: 87.36 + - type: mrr_at_1000 + value: 87.36 + - type: mrr_at_3 + value: 86.235 + - type: mrr_at_5 + value: 86.945 + - type: ndcg_at_1 + value: 81.01 + - type: ndcg_at_10 + value: 87.99900000000001 + - type: ndcg_at_100 + value: 89.217 + - type: ndcg_at_1000 + value: 89.33 + - type: ndcg_at_3 + value: 85.053 + - type: ndcg_at_5 + value: 86.703 + - type: precision_at_1 + value: 81.01 + - type: precision_at_10 + value: 13.336 + - type: precision_at_100 + value: 1.52 
+ - type: precision_at_1000 + value: 0.156 + - type: precision_at_3 + value: 37.14 + - type: precision_at_5 + value: 24.44 + - type: recall_at_1 + value: 70.402 + - type: recall_at_10 + value: 95.214 + - type: recall_at_100 + value: 99.438 + - type: recall_at_1000 + value: 99.928 + - type: recall_at_3 + value: 86.75699999999999 + - type: recall_at_5 + value: 91.44099999999999 + - task: + type: Clustering + dataset: + type: mteb/reddit-clustering + name: MTEB RedditClustering + config: default + split: test + revision: 24640382cdbf8abc73003fb0fa6d111a705499eb + metrics: + - type: v_measure + value: 56.51721502758904 + - task: + type: Clustering + dataset: + type: mteb/reddit-clustering-p2p + name: MTEB RedditClusteringP2P + config: default + split: test + revision: 282350215ef01743dc01b456c7f5241fa8937f16 + metrics: + - type: v_measure + value: 61.054808572333016 + - task: + type: Retrieval + dataset: + type: scidocs + name: MTEB SCIDOCS + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 4.578 + - type: map_at_10 + value: 11.036999999999999 + - type: map_at_100 + value: 12.879999999999999 + - type: map_at_1000 + value: 13.150999999999998 + - type: map_at_3 + value: 8.133 + - type: map_at_5 + value: 9.559 + - type: mrr_at_1 + value: 22.6 + - type: mrr_at_10 + value: 32.68 + - type: mrr_at_100 + value: 33.789 + - type: mrr_at_1000 + value: 33.854 + - type: mrr_at_3 + value: 29.7 + - type: mrr_at_5 + value: 31.480000000000004 + - type: ndcg_at_1 + value: 22.6 + - type: ndcg_at_10 + value: 18.616 + - type: ndcg_at_100 + value: 25.883 + - type: ndcg_at_1000 + value: 30.944 + - type: ndcg_at_3 + value: 18.136 + - type: ndcg_at_5 + value: 15.625 + - type: precision_at_1 + value: 22.6 + - type: precision_at_10 + value: 9.48 + - type: precision_at_100 + value: 1.991 + - type: precision_at_1000 + value: 0.321 + - type: precision_at_3 + value: 16.8 + - type: precision_at_5 + value: 13.54 + - type: recall_at_1 + value: 4.578 + - type: 
recall_at_10 + value: 19.213 + - type: recall_at_100 + value: 40.397 + - type: recall_at_1000 + value: 65.2 + - type: recall_at_3 + value: 10.208 + - type: recall_at_5 + value: 13.718 + - task: + type: STS + dataset: + type: mteb/sickr-sts + name: MTEB SICK-R + config: default + split: test + revision: a6ea5a8cab320b040a23452cc28066d9beae2cee + metrics: + - type: cos_sim_pearson + value: 83.44288351714071 + - type: cos_sim_spearman + value: 79.37995604564952 + - type: euclidean_pearson + value: 81.1078874670718 + - type: euclidean_spearman + value: 79.37995905980499 + - type: manhattan_pearson + value: 81.03697527288986 + - type: manhattan_spearman + value: 79.33490235296236 + - task: + type: STS + dataset: + type: mteb/sts12-sts + name: MTEB STS12 + config: default + split: test + revision: a0d554a64d88156834ff5ae9920b964011b16384 + metrics: + - type: cos_sim_pearson + value: 84.95557650436523 + - type: cos_sim_spearman + value: 78.5190672399868 + - type: euclidean_pearson + value: 81.58064025904707 + - type: euclidean_spearman + value: 78.5190672399868 + - type: manhattan_pearson + value: 81.52857930619889 + - type: manhattan_spearman + value: 78.50421361308034 + - task: + type: STS + dataset: + type: mteb/sts13-sts + name: MTEB STS13 + config: default + split: test + revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca + metrics: + - type: cos_sim_pearson + value: 84.79128416228737 + - type: cos_sim_spearman + value: 86.05402451477147 + - type: euclidean_pearson + value: 85.46280267054289 + - type: euclidean_spearman + value: 86.05402451477147 + - type: manhattan_pearson + value: 85.46278563858236 + - type: manhattan_spearman + value: 86.08079590861004 + - task: + type: STS + dataset: + type: mteb/sts14-sts + name: MTEB STS14 + config: default + split: test + revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375 + metrics: + - type: cos_sim_pearson + value: 83.20623089568763 + - type: cos_sim_spearman + value: 81.53786907061009 + - type: euclidean_pearson + value: 
82.82272250091494 + - type: euclidean_spearman + value: 81.53786907061009 + - type: manhattan_pearson + value: 82.78850494027013 + - type: manhattan_spearman + value: 81.5135618083407 + - task: + type: STS + dataset: + type: mteb/sts15-sts + name: MTEB STS15 + config: default + split: test + revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3 + metrics: + - type: cos_sim_pearson + value: 85.46366618397936 + - type: cos_sim_spearman + value: 86.96566013336908 + - type: euclidean_pearson + value: 86.62651697548931 + - type: euclidean_spearman + value: 86.96565526364454 + - type: manhattan_pearson + value: 86.58812160258009 + - type: manhattan_spearman + value: 86.9336484321288 + - task: + type: STS + dataset: + type: mteb/sts16-sts + name: MTEB STS16 + config: default + split: test + revision: 4d8694f8f0e0100860b497b999b3dbed754a0513 + metrics: + - type: cos_sim_pearson + value: 82.51858358641559 + - type: cos_sim_spearman + value: 84.7652527954999 + - type: euclidean_pearson + value: 84.23914783766861 + - type: euclidean_spearman + value: 84.7652527954999 + - type: manhattan_pearson + value: 84.22749648503171 + - type: manhattan_spearman + value: 84.74527996746386 + - task: + type: STS + dataset: + type: mteb/sts17-crosslingual-sts + name: MTEB STS17 (en-en) + config: en-en + split: test + revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d + metrics: + - type: cos_sim_pearson + value: 87.28026563313065 + - type: cos_sim_spearman + value: 87.46928143824915 + - type: euclidean_pearson + value: 88.30558762000372 + - type: euclidean_spearman + value: 87.46928143824915 + - type: manhattan_pearson + value: 88.10513330809331 + - type: manhattan_spearman + value: 87.21069787834173 + - task: + type: STS + dataset: + type: mteb/sts22-crosslingual-sts + name: MTEB STS22 (en) + config: en + split: test + revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80 + metrics: + - type: cos_sim_pearson + value: 62.376497134587375 + - type: cos_sim_spearman + value: 65.0159550112516 + - 
type: euclidean_pearson + value: 65.64572120879598 + - type: euclidean_spearman + value: 65.0159550112516 + - type: manhattan_pearson + value: 65.88143604989976 + - type: manhattan_spearman + value: 65.17547297222434 + - task: + type: STS + dataset: + type: mteb/stsbenchmark-sts + name: MTEB STSBenchmark + config: default + split: test + revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831 + metrics: + - type: cos_sim_pearson + value: 84.22876368947644 + - type: cos_sim_spearman + value: 85.46935577445318 + - type: euclidean_pearson + value: 85.32830231392005 + - type: euclidean_spearman + value: 85.46935577445318 + - type: manhattan_pearson + value: 85.30353211758495 + - type: manhattan_spearman + value: 85.42821085956945 + - task: + type: Reranking + dataset: + type: mteb/scidocs-reranking + name: MTEB SciDocsRR + config: default + split: test + revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab + metrics: + - type: map + value: 80.60986667767133 + - type: mrr + value: 94.29432314236236 + - task: + type: Retrieval + dataset: + type: scifact + name: MTEB SciFact + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 54.528 + - type: map_at_10 + value: 65.187 + - type: map_at_100 + value: 65.62599999999999 + - type: map_at_1000 + value: 65.657 + - type: map_at_3 + value: 62.352 + - type: map_at_5 + value: 64.025 + - type: mrr_at_1 + value: 57.333 + - type: mrr_at_10 + value: 66.577 + - type: mrr_at_100 + value: 66.88 + - type: mrr_at_1000 + value: 66.908 + - type: mrr_at_3 + value: 64.556 + - type: mrr_at_5 + value: 65.739 + - type: ndcg_at_1 + value: 57.333 + - type: ndcg_at_10 + value: 70.275 + - type: ndcg_at_100 + value: 72.136 + - type: ndcg_at_1000 + value: 72.963 + - type: ndcg_at_3 + value: 65.414 + - type: ndcg_at_5 + value: 67.831 + - type: precision_at_1 + value: 57.333 + - type: precision_at_10 + value: 9.5 + - type: precision_at_100 + value: 1.057 + - type: precision_at_1000 + value: 0.11199999999999999 + - type: 
precision_at_3 + value: 25.778000000000002 + - type: precision_at_5 + value: 17.2 + - type: recall_at_1 + value: 54.528 + - type: recall_at_10 + value: 84.356 + - type: recall_at_100 + value: 92.833 + - type: recall_at_1000 + value: 99.333 + - type: recall_at_3 + value: 71.283 + - type: recall_at_5 + value: 77.14999999999999 + - task: + type: PairClassification + dataset: + type: mteb/sprintduplicatequestions-pairclassification + name: MTEB SprintDuplicateQuestions + config: default + split: test + revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46 + metrics: + - type: cos_sim_accuracy + value: 99.74158415841585 + - type: cos_sim_ap + value: 92.90048959850317 + - type: cos_sim_f1 + value: 86.35650810245687 + - type: cos_sim_precision + value: 90.4709748083242 + - type: cos_sim_recall + value: 82.6 + - type: dot_accuracy + value: 99.74158415841585 + - type: dot_ap + value: 92.90048959850317 + - type: dot_f1 + value: 86.35650810245687 + - type: dot_precision + value: 90.4709748083242 + - type: dot_recall + value: 82.6 + - type: euclidean_accuracy + value: 99.74158415841585 + - type: euclidean_ap + value: 92.90048959850317 + - type: euclidean_f1 + value: 86.35650810245687 + - type: euclidean_precision + value: 90.4709748083242 + - type: euclidean_recall + value: 82.6 + - type: manhattan_accuracy + value: 99.74158415841585 + - type: manhattan_ap + value: 92.87344692947894 + - type: manhattan_f1 + value: 86.38497652582159 + - type: manhattan_precision + value: 90.29443838604145 + - type: manhattan_recall + value: 82.8 + - type: max_accuracy + value: 99.74158415841585 + - type: max_ap + value: 92.90048959850317 + - type: max_f1 + value: 86.38497652582159 + - task: + type: Clustering + dataset: + type: mteb/stackexchange-clustering + name: MTEB StackExchangeClustering + config: default + split: test + revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259 + metrics: + - type: v_measure + value: 63.191648770424216 + - task: + type: Clustering + dataset: + type: 
mteb/stackexchange-clustering-p2p + name: MTEB StackExchangeClusteringP2P + config: default + split: test + revision: 815ca46b2622cec33ccafc3735d572c266efdb44 + metrics: + - type: v_measure + value: 34.02944668730218 + - task: + type: Reranking + dataset: + type: mteb/stackoverflowdupquestions-reranking + name: MTEB StackOverflowDupQuestions + config: default + split: test + revision: e185fbe320c72810689fc5848eb6114e1ef5ec69 + metrics: + - type: map + value: 50.466386167525265 + - type: mrr + value: 51.19071492233257 + - task: + type: Summarization + dataset: + type: mteb/summeval + name: MTEB SummEval + config: default + split: test + revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c + metrics: + - type: cos_sim_pearson + value: 30.198022505886435 + - type: cos_sim_spearman + value: 30.40170257939193 + - type: dot_pearson + value: 30.198015316402614 + - type: dot_spearman + value: 30.40170257939193 + - task: + type: Retrieval + dataset: + type: trec-covid + name: MTEB TRECCOVID + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 0.242 + - type: map_at_10 + value: 2.17 + - type: map_at_100 + value: 12.221 + - type: map_at_1000 + value: 28.63 + - type: map_at_3 + value: 0.728 + - type: map_at_5 + value: 1.185 + - type: mrr_at_1 + value: 94 + - type: mrr_at_10 + value: 97 + - type: mrr_at_100 + value: 97 + - type: mrr_at_1000 + value: 97 + - type: mrr_at_3 + value: 97 + - type: mrr_at_5 + value: 97 + - type: ndcg_at_1 + value: 89 + - type: ndcg_at_10 + value: 82.30499999999999 + - type: ndcg_at_100 + value: 61.839999999999996 + - type: ndcg_at_1000 + value: 53.381 + - type: ndcg_at_3 + value: 88.877 + - type: ndcg_at_5 + value: 86.05199999999999 + - type: precision_at_1 + value: 94 + - type: precision_at_10 + value: 87 + - type: precision_at_100 + value: 63.38 + - type: precision_at_1000 + value: 23.498 + - type: precision_at_3 + value: 94 + - type: precision_at_5 + value: 92 + - type: recall_at_1 + value: 0.242 + - type: 
recall_at_10 + value: 2.302 + - type: recall_at_100 + value: 14.979000000000001 + - type: recall_at_1000 + value: 49.638 + - type: recall_at_3 + value: 0.753 + - type: recall_at_5 + value: 1.226 + - task: + type: Retrieval + dataset: + type: webis-touche2020 + name: MTEB Touche2020 + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 3.006 + - type: map_at_10 + value: 11.805 + - type: map_at_100 + value: 18.146 + - type: map_at_1000 + value: 19.788 + - type: map_at_3 + value: 5.914 + - type: map_at_5 + value: 8.801 + - type: mrr_at_1 + value: 40.816 + - type: mrr_at_10 + value: 56.36600000000001 + - type: mrr_at_100 + value: 56.721999999999994 + - type: mrr_at_1000 + value: 56.721999999999994 + - type: mrr_at_3 + value: 52.041000000000004 + - type: mrr_at_5 + value: 54.796 + - type: ndcg_at_1 + value: 37.755 + - type: ndcg_at_10 + value: 29.863 + - type: ndcg_at_100 + value: 39.571 + - type: ndcg_at_1000 + value: 51.385999999999996 + - type: ndcg_at_3 + value: 32.578 + - type: ndcg_at_5 + value: 32.351 + - type: precision_at_1 + value: 40.816 + - type: precision_at_10 + value: 26.531 + - type: precision_at_100 + value: 7.796 + - type: precision_at_1000 + value: 1.555 + - type: precision_at_3 + value: 32.653 + - type: precision_at_5 + value: 33.061 + - type: recall_at_1 + value: 3.006 + - type: recall_at_10 + value: 18.738 + - type: recall_at_100 + value: 48.058 + - type: recall_at_1000 + value: 83.41300000000001 + - type: recall_at_3 + value: 7.166 + - type: recall_at_5 + value: 12.102 + - task: + type: Classification + dataset: + type: mteb/toxic_conversations_50k + name: MTEB ToxicConversationsClassification + config: default + split: test + revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c + metrics: + - type: accuracy + value: 71.4178 + - type: ap + value: 14.648781342150446 + - type: f1 + value: 55.07299194946378 + - task: + type: Classification + dataset: + type: mteb/tweet_sentiment_extraction + name: MTEB 
TweetSentimentExtractionClassification + config: default + split: test + revision: d604517c81ca91fe16a244d1248fc021f9ecee7a + metrics: + - type: accuracy + value: 60.919637804187886 + - type: f1 + value: 61.24122013967399 + - task: + type: Clustering + dataset: + type: mteb/twentynewsgroups-clustering + name: MTEB TwentyNewsgroupsClustering + config: default + split: test + revision: 6125ec4e24fa026cec8a478383ee943acfbd5449 + metrics: + - type: v_measure + value: 49.207896583685695 + - task: + type: PairClassification + dataset: + type: mteb/twittersemeval2015-pairclassification + name: MTEB TwitterSemEval2015 + config: default + split: test + revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1 + metrics: + - type: cos_sim_accuracy + value: 86.23114978840078 + - type: cos_sim_ap + value: 74.26624727825818 + - type: cos_sim_f1 + value: 68.72377190817083 + - type: cos_sim_precision + value: 64.56400742115028 + - type: cos_sim_recall + value: 73.45646437994723 + - type: dot_accuracy + value: 86.23114978840078 + - type: dot_ap + value: 74.26624032659652 + - type: dot_f1 + value: 68.72377190817083 + - type: dot_precision + value: 64.56400742115028 + - type: dot_recall + value: 73.45646437994723 + - type: euclidean_accuracy + value: 86.23114978840078 + - type: euclidean_ap + value: 74.26624714480556 + - type: euclidean_f1 + value: 68.72377190817083 + - type: euclidean_precision + value: 64.56400742115028 + - type: euclidean_recall + value: 73.45646437994723 + - type: manhattan_accuracy + value: 86.16558383501221 + - type: manhattan_ap + value: 74.2091943976357 + - type: manhattan_f1 + value: 68.64221520524654 + - type: manhattan_precision + value: 63.59135913591359 + - type: manhattan_recall + value: 74.5646437994723 + - type: max_accuracy + value: 86.23114978840078 + - type: max_ap + value: 74.26624727825818 + - type: max_f1 + value: 68.72377190817083 + - task: + type: PairClassification + dataset: + type: mteb/twitterurlcorpus-pairclassification + name: MTEB 
TwitterURLCorpus + config: default + split: test + revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf + metrics: + - type: cos_sim_accuracy + value: 89.3681841114604 + - type: cos_sim_ap + value: 86.65166387498546 + - type: cos_sim_f1 + value: 79.02581944698774 + - type: cos_sim_precision + value: 75.35796605434099 + - type: cos_sim_recall + value: 83.06898675700647 + - type: dot_accuracy + value: 89.3681841114604 + - type: dot_ap + value: 86.65166019802056 + - type: dot_f1 + value: 79.02581944698774 + - type: dot_precision + value: 75.35796605434099 + - type: dot_recall + value: 83.06898675700647 + - type: euclidean_accuracy + value: 89.3681841114604 + - type: euclidean_ap + value: 86.65166462876266 + - type: euclidean_f1 + value: 79.02581944698774 + - type: euclidean_precision + value: 75.35796605434099 + - type: euclidean_recall + value: 83.06898675700647 + - type: manhattan_accuracy + value: 89.36624364497226 + - type: manhattan_ap + value: 86.65076471274106 + - type: manhattan_f1 + value: 79.07408783532733 + - type: manhattan_precision + value: 76.41102972856527 + - type: manhattan_recall + value: 81.92947336002464 + - type: max_accuracy + value: 89.3681841114604 + - type: max_ap + value: 86.65166462876266 + - type: max_f1 + value: 79.07408783532733 +license: apache-2.0 +language: +- en --- -# SetFit with nomic-ai/nomic-embed-text-v1.5 - -This is a [SetFit](https://github.com/huggingface/setfit) model that can be used for Text Classification. This SetFit model uses [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) as the Sentence Transformer embedding model. A [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) instance is used for classification. - -The model has been trained using an efficient few-shot learning technique that involves: - -1. Fine-tuning a [Sentence Transformer](https://www.sbert.net) with contrastive learning. -2. 
Training a classification head with features from the fine-tuned Sentence Transformer. - -## Model Details - -### Model Description -- **Model Type:** SetFit -- **Sentence Transformer body:** [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) -- **Classification head:** a [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) instance -- **Maximum Sequence Length:** 8192 tokens -- **Number of Classes:** 63 classes - - - - -### Model Sources - -- **Repository:** [SetFit on GitHub](https://github.com/huggingface/setfit) -- **Paper:** [Efficient Few-Shot Learning Without Prompts](https://arxiv.org/abs/2209.11055) -- **Blogpost:** [SetFit: Efficient Few-Shot Learning Without Prompts](https://huggingface.co/blog/setfit) - -### Model Labels -| Label | Examples | -|:---------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| setBorderStyle | | -| setChildIndex | | -| undo | | -| setTextListStyle | | -| align | | -| resetImageSettings | | -| rotate | | -| ungroup | | -| setBlendMode | | -| addText | | -| setParagraphSpacing | | -| addAsset | | -| addAssetsUIAction | | -| redo | | -| applyAnimationObjectPreset | | -| moveRelativeTo | | -| resizePage | | -| setBlur | | -| group | | -| displayHelpQA | | -| applyPageLevelAnimation | | -| duplicatePage | | -| setBackgroundColor | | -| setLetterSpacing | | -| setWarmth | | -| setSaturation | | -| replaceText | | -| setLineSpacing | | -| setSharpness | | -| delete | | -| setContrast | | -| setTextLayout | | -| generateImageFromText | | -| flip | | -| generateTemplates | | -| setHighlight | | -| removeBackground | | -| fixTypography | | -| setBrightness | | -| changeContentInImage | | -| setImageAsBackground | | -| resize | | -| setFontSize | | -| showUploadUI | | -| replaceAsset | | -| 
distributeAroundObject | | -| select | | -| setOpacity | | -| setDropShadow | | -| proposeAssets | | -| setShadow | | -| setCropShape | | -| setTextAlignment | | -| setFontStyle | | -| duplicateSceneNode | | -| setFontType | | -| setTextDecorationEffect | | -| scatter | | -| removeContentFromImage | | -| setFillColor | | -| setTextShadow | | -| previewFilterEffects | | -| duplicateSceneNodeToPage | | - -## Evaluation - -### Metrics -| Label | Accuracy | -|:--------|:---------| -| **all** | 0.5701 | - -## Uses - -### Direct Use for Inference - -First install the SetFit library: +# nomic-embed-text-v1.5: Resizable Production Embeddings with Matryoshka Representation Learning -```bash -pip install setfit +[Blog](https://www.nomic.ai/blog/posts/nomic-embed-text-v1) | [Technical Report](https://arxiv.org/abs/2402.01613) | [AWS SageMaker](https://aws.amazon.com/marketplace/seller-profile?id=seller-tpqidcj54zawi) | [Nomic Platform](https://atlas.nomic.ai) + +**Exciting Update!**: `nomic-embed-text-v1.5` is now multimodal! [nomic-embed-vision-v1.5](https://huggingface.co/nomic-ai/nomic-embed-vision-v1.5) is aligned to the embedding space of `nomic-embed-text-v1.5`, meaning any text embedding is multimodal! + +## Usage + +**Important**: the text prompt *must* include a *task instruction prefix*, instructing the model which task is being performed. + +For example, if you are implementing a RAG application, you embed your documents as `search_document: ` and embed your user queries as `search_query: `. + +## Task instruction prefixes + +### `search_document` + +#### Purpose: embed texts as documents from a dataset + +This prefix is used for embedding texts as documents, for example as documents for a RAG index. 
+ +```python +from sentence_transformers import SentenceTransformer + +model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True) +sentences = ['search_document: TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten'] +embeddings = model.encode(sentences) +print(embeddings) ``` -Then you can load this model and run inference. +### `search_query` + +#### Purpose: embed texts as questions to answer + +This prefix is used for embedding texts as questions that documents from a dataset could resolve, for example as queries to be answered by a RAG application. ```python -from setfit import SetFitModel +from sentence_transformers import SentenceTransformer -# Download from the 🤗 Hub -model = SetFitModel.from_pretrained("setfit_model_id") -# Run inference -preds = model("Make the photo lighter") +model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True) +sentences = ['search_query: Who is Laurens van Der Maaten?'] +embeddings = model.encode(sentences) +print(embeddings) ``` - - - - - - - - -## Training Details - -### Training Set Metrics -| Training set | Min | Median | Max | -|:-------------|:----|:-------|:----| -| Word count | 3 | 5.2063 | 11 | - -| Label | Training Sample Count | -|:---------------------------|:----------------------| -| addAsset | 8 | -| addAssetsUIAction | 8 | -| addText | 8 | -| align | 8 | -| applyAnimationObjectPreset | 8 | -| applyPageLevelAnimation | 8 | -| changeContentInImage | 8 | -| delete | 8 | -| displayHelpQA | 8 | -| distributeAroundObject | 8 | -| duplicatePage | 8 | -| duplicateSceneNode | 8 | -| duplicateSceneNodeToPage | 8 | -| fixTypography | 8 | -| flip | 8 | -| generateImageFromText | 8 | -| generateTemplates | 8 | -| group | 8 | -| moveRelativeTo | 8 | -| previewFilterEffects | 8 | -| proposeAssets | 8 | -| redo | 8 | -| removeBackground | 8 | -| removeContentFromImage | 8 | -| replaceAsset | 8 | -| replaceText | 8 | -| resetImageSettings | 8 | -| 
resize | 8 | -| resizePage | 8 | -| rotate | 8 | -| scatter | 8 | -| select | 8 | -| setBackgroundColor | 8 | -| setBlendMode | 8 | -| setBlur | 8 | -| setBorderStyle | 8 | -| setBrightness | 8 | -| setChildIndex | 8 | -| setContrast | 8 | -| setCropShape | 8 | -| setDropShadow | 8 | -| setFillColor | 8 | -| setFontSize | 8 | -| setFontStyle | 8 | -| setFontType | 8 | -| setHighlight | 8 | -| setImageAsBackground | 8 | -| setLetterSpacing | 8 | -| setLineSpacing | 8 | -| setOpacity | 8 | -| setParagraphSpacing | 8 | -| setSaturation | 8 | -| setShadow | 8 | -| setSharpness | 8 | -| setTextAlignment | 8 | -| setTextDecorationEffect | 8 | -| setTextLayout | 8 | -| setTextListStyle | 8 | -| setTextShadow | 8 | -| setWarmth | 8 | -| showUploadUI | 8 | -| undo | 8 | -| ungroup | 8 | - -### Training Hyperparameters -- batch_size: (64, 64) -- num_epochs: (1, 1) -- max_steps: -1 -- sampling_strategy: oversampling -- body_learning_rate: (2e-05, 1e-05) -- head_learning_rate: 0.01 -- loss: CosineSimilarityLoss -- distance_metric: cosine_distance -- margin: 0.25 -- end_to_end: False -- use_amp: False -- warmup_proportion: 0.1 -- l2_weight: 0.01 -- seed: 42 -- eval_max_steps: -1 -- load_best_model_at_end: False - -### Training Results -| Epoch | Step | Training Loss | Validation Loss | -|:------:|:----:|:-------------:|:---------------:| -| 0.0003 | 1 | 0.1733 | - | -| 0.0128 | 50 | 0.1425 | - | -| 0.0256 | 100 | 0.0954 | - | -| 0.0384 | 150 | 0.0483 | - | -| 0.0512 | 200 | 0.0297 | - | -| 0.0640 | 250 | 0.0189 | - | -| 0.0768 | 300 | 0.0122 | - | -| 0.0896 | 350 | 0.0099 | - | -| 0.1024 | 400 | 0.0085 | - | -| 0.1152 | 450 | 0.0067 | - | -| 0.1280 | 500 | 0.0059 | - | -| 0.1408 | 550 | 0.0076 | - | -| 0.1536 | 600 | 0.0061 | - | -| 0.1664 | 650 | 0.0049 | - | -| 0.1792 | 700 | 0.003 | - | -| 0.1920 | 750 | 0.0023 | - | -| 0.2048 | 800 | 0.002 | - | -| 0.2176 | 850 | 0.0037 | - | -| 0.2304 | 900 | 0.0007 | - | -| 0.2432 | 950 | 0.0015 | - | -| 0.2560 | 1000 | 0.0017 | - | -| 
0.2688 | 1050 | 0.0014 | - | -| 0.2816 | 1100 | 0.0003 | - | -| 0.2944 | 1150 | 0.0006 | - | -| 0.3072 | 1200 | 0.001 | - | -| 0.3200 | 1250 | 0.0007 | - | -| 0.3328 | 1300 | 0.0002 | - | -| 0.3456 | 1350 | 0.0012 | - | -| 0.3584 | 1400 | 0.0006 | - | -| 0.3712 | 1450 | 0.0003 | - | -| 0.3840 | 1500 | 0.0005 | - | -| 0.3968 | 1550 | 0.0002 | - | -| 0.4096 | 1600 | 0.0004 | - | -| 0.4224 | 1650 | 0.0009 | - | -| 0.4352 | 1700 | 0.0007 | - | -| 0.4480 | 1750 | 0.0003 | - | -| 0.4608 | 1800 | 0.0001 | - | -| 0.4736 | 1850 | 0.0006 | - | -| 0.4864 | 1900 | 0.0004 | - | -| 0.4992 | 1950 | 0.0004 | - | -| 0.5120 | 2000 | 0.0001 | - | -| 0.5248 | 2050 | 0.0004 | - | -| 0.5376 | 2100 | 0.0003 | - | -| 0.5504 | 2150 | 0.0012 | - | -| 0.5632 | 2200 | 0.0006 | - | -| 0.5760 | 2250 | 0.0001 | - | -| 0.5888 | 2300 | 0.0003 | - | -| 0.6016 | 2350 | 0.0007 | - | -| 0.6144 | 2400 | 0.0003 | - | -| 0.6272 | 2450 | 0.0004 | - | -| 0.6400 | 2500 | 0.0006 | - | -| 0.6528 | 2550 | 0.0001 | - | -| 0.6656 | 2600 | 0.0003 | - | -| 0.6784 | 2650 | 0.0006 | - | -| 0.6912 | 2700 | 0.001 | - | -| 0.7040 | 2750 | 0.0004 | - | -| 0.7168 | 2800 | 0.0008 | - | -| 0.7296 | 2850 | 0.0003 | - | -| 0.7424 | 2900 | 0.0007 | - | -| 0.7552 | 2950 | 0.0007 | - | -| 0.7680 | 3000 | 0.0007 | - | -| 0.7808 | 3050 | 0.0003 | - | -| 0.7937 | 3100 | 0.0003 | - | -| 0.8065 | 3150 | 0.0003 | - | -| 0.8193 | 3200 | 0.0007 | - | -| 0.8321 | 3250 | 0.0001 | - | -| 0.8449 | 3300 | 0.0006 | - | -| 0.8577 | 3350 | 0.0007 | - | -| 0.8705 | 3400 | 0.0003 | - | -| 0.8833 | 3450 | 0.0009 | - | -| 0.8961 | 3500 | 0.0003 | - | -| 0.9089 | 3550 | 0.0004 | - | -| 0.9217 | 3600 | 0.0004 | - | -| 0.9345 | 3650 | 0.0001 | - | -| 0.9473 | 3700 | 0.0001 | - | -| 0.9601 | 3750 | 0.0007 | - | -| 0.9729 | 3800 | 0.0006 | - | -| 0.9857 | 3850 | 0.0005 | - | -| 0.9985 | 3900 | 0.0003 | - | - -### Framework Versions -- Python: 3.12.11 -- SetFit: 1.1.3 -- Sentence Transformers: 5.1.0 -- Transformers: 4.54.1 -- PyTorch: 2.7.1 -- Datasets: 
4.0.0 -- Tokenizers: 0.21.4 - -## Citation - -### BibTeX -```bibtex -@article{https://doi.org/10.48550/arxiv.2209.11055, - doi = {10.48550/ARXIV.2209.11055}, - url = {https://arxiv.org/abs/2209.11055}, - author = {Tunstall, Lewis and Reimers, Nils and Jo, Unso Eun Seo and Bates, Luke and Korat, Daniel and Wasserblat, Moshe and Pereg, Oren}, - keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Efficient Few-Shot Learning Without Prompts}, - publisher = {arXiv}, - year = {2022}, - copyright = {Creative Commons Attribution 4.0 International} -} +### `clustering` + +#### Purpose: embed texts to group them into clusters + +This prefix is used for embedding texts in order to group them into clusters, discover common topics, or remove semantic duplicates. + +```python +from sentence_transformers import SentenceTransformer + +model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True) +sentences = ['clustering: the quick brown fox'] +embeddings = model.encode(sentences) +print(embeddings) +``` + +### `classification` + +#### Purpose: embed texts to classify them + +This prefix is used for embedding texts into vectors that will be used as features for a classification model. + +```python +from sentence_transformers import SentenceTransformer + +model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True) +sentences = ['classification: the quick brown fox'] +embeddings = model.encode(sentences) +print(embeddings) ``` - +### Sentence Transformers +```python +import torch.nn.functional as F +from sentence_transformers import SentenceTransformer + +matryoshka_dim = 512 + +model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True) +sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?'] +embeddings = model.encode(sentences, convert_to_tensor=True) +embeddings =
F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],)) +embeddings = embeddings[:, :matryoshka_dim] +embeddings = F.normalize(embeddings, p=2, dim=1) +print(embeddings) +``` + +### Transformers + +```diff +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer, AutoModel + +def mean_pooling(model_output, attention_mask): + token_embeddings = model_output[0] + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + +sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?'] + +tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') +model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True, safe_serialization=True) +model.eval() + +encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') + ++ matryoshka_dim = 512 + +with torch.no_grad(): + model_output = model(**encoded_input) + +embeddings = mean_pooling(model_output, encoded_input['attention_mask']) ++ embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],)) ++ embeddings = embeddings[:, :matryoshka_dim] +embeddings = F.normalize(embeddings, p=2, dim=1) +print(embeddings) +``` - +```diff +- tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') ++ tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=8192) - \ No newline at end of file +- model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True) ++ model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True, rotary_scaling_factor=2) +``` + +### Transformers.js + +```js +import { pipeline, layer_norm } from '@huggingface/transformers'; + +// Create a feature extraction pipeline +const extractor = await pipeline('feature-extraction', 
'nomic-ai/nomic-embed-text-v1.5'); + +// Define sentences +const texts = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']; + +// Compute sentence embeddings +let embeddings = await extractor(texts, { pooling: 'mean' }); +console.log(embeddings); // Tensor of shape [2, 768] + +const matryoshka_dim = 512; +embeddings = layer_norm(embeddings, [embeddings.dims[1]]) + .slice(null, [0, matryoshka_dim]) + .normalize(2, -1); +console.log(embeddings.tolist()); +``` + + +## Nomic API + +The easiest way to use Nomic Embed is through the Nomic Embedding API. + +Generating embeddings with the `nomic` Python client is as easy as + +```python +from nomic import embed + +output = embed.text( + texts=['Nomic Embedding API', '#keepAIOpen'], + model='nomic-embed-text-v1.5', + task_type='search_document', + dimensionality=256, +) + +print(output) +``` + +For more information, see the [API reference](https://docs.nomic.ai/reference/endpoints/nomic-embed-text) + + +## Infinity + +Usage with [Infinity](https://github.com/michaelfeil/infinity). + +```bash +docker run --gpus all -v $PWD/data:/app/.cache -e HF_TOKEN=$HF_TOKEN -p "7997":"7997" \ +michaelf34/infinity:0.0.70 \ +v2 --model-id nomic-ai/nomic-embed-text-v1.5 --revision "main" --dtype float16 --batch-size 8 --engine torch --port 7997 --no-bettertransformer +``` + +## Adjusting Dimensionality + +`nomic-embed-text-v1.5` is an improvement upon [Nomic Embed](https://huggingface.co/nomic-ai/nomic-embed-text-v1) that utilizes [Matryoshka Representation Learning](https://arxiv.org/abs/2205.13147) which gives developers the flexibility to trade off the embedding size for a negligible reduction in performance. 
+ + +| Name | SeqLen | Dimension | MTEB | +| :-------------------------------:| :----- | :-------- | :------: | +| nomic-embed-text-v1 | 8192 | 768 | **62.39** | +| nomic-embed-text-v1.5 | 8192 | 768 | 62.28 | +| nomic-embed-text-v1.5 | 8192 | 512 | 61.96 | +| nomic-embed-text-v1.5 | 8192 | 256 | 61.04 | +| nomic-embed-text-v1.5 | 8192 | 128 | 59.34 | +| nomic-embed-text-v1.5 | 8192 | 64 | 56.10 | + + +![image/png](https://cdn-uploads.huggingface.co/production/uploads/607997c83a565c15675055b3/CRnaHV-c2wMUMZKw72q85.png) + +## Training +Click the Nomic Atlas map below to visualize a 5M sample of our contrastive pretraining data! + +[![image/webp](https://cdn-uploads.huggingface.co/production/uploads/607997c83a565c15675055b3/pjhJhuNyRfPagRd_c_iUz.webp)](https://atlas.nomic.ai/map/nomic-text-embed-v1-5m-sample) + +We train our embedder using a multi-stage training pipeline. Starting from a long-context [BERT model](https://huggingface.co/nomic-ai/nomic-bert-2048), +the first unsupervised contrastive stage trains on a dataset generated from weakly related text pairs, such as question-answer pairs from forums like StackExchange and Quora, title-body pairs from Amazon reviews, and summarizations from news articles. + +In the second finetuning stage, higher-quality labeled datasets such as search queries and answers from web searches are leveraged. Data curation and hard-example mining are crucial in this stage. + +For more details, see the Nomic Embed [Technical Report](https://static.nomic.ai/reports/2024_Nomic_Embed_Text_Technical_Report.pdf) and corresponding [blog post](https://blog.nomic.ai/posts/nomic-embed-matryoshka). + +Training data to train the models is released in its entirety. 
For more details, see the `contrastors` [repository](https://github.com/nomic-ai/contrastors). + + +# Join the Nomic Community + +- Nomic: [https://nomic.ai](https://nomic.ai) +- Discord: [https://discord.gg/myY5YDR8z8](https://discord.gg/myY5YDR8z8) +- Twitter: [https://twitter.com/nomic_ai](https://twitter.com/nomic_ai) + + +# Citation + +If you find the model, dataset, or training code useful, please cite our work: + +```bibtex +@misc{nussbaum2024nomic, + title={Nomic Embed: Training a Reproducible Long Context Text Embedder}, + author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar}, + year={2024}, + eprint={2402.01613}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` \ No newline at end of file