Text Model

  • Number of models: 225
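
The per-model attributes listed below (max tokens, embedding dimension, parameters, required memory, release date, languages) can also be read programmatically. A minimal sketch, assuming the mteb Python package exposes this catalog through a get_model_meta helper with the field names shown in the comments:

import mteb

# Look up the catalog entry for one model; field names are assumed from mteb's ModelMeta.
meta = mteb.get_model_meta("BAAI/bge-base-en-v1.5")
print(meta.name)          # model identifier
print(meta.n_parameters)  # parameter count
print(meta.embed_dim)     # embedding dimension
print(meta.max_tokens)    # maximum input length in tokens
print(meta.license)       # license identifier
print(meta.release_date)  # release date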

Instruction Model

Alibaba-NLP/gte-Qwen1.5-7B-instruct

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.7B 28.8 GB 2024-04-20 eng-Latn
Citation
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}
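
Instruction-tuned retrievers like this one prepend a natural-language task instruction to the query side only; documents are encoded without any prefix. A minimal sketch with sentence-transformers, assuming the gte-Qwen checkpoints require trust_remote_code and follow the "Instruct: {task}\nQuery: {query}" prompt convention described on their model cards:

from sentence_transformers import SentenceTransformer

# Assumptions: custom modeling code (trust_remote_code) and the
# "Instruct: {task}\nQuery: {query}" query prompt used by the gte-Qwen family.
model = SentenceTransformer("Alibaba-NLP/gte-Qwen1.5-7B-instruct", trust_remote_code=True)

task = "Given a web search query, retrieve relevant passages that answer the query"
queries = [f"Instruct: {task}\nQuery: how much protein should a female eat"]
documents = ["As a general guideline, the CDC's average protein requirement for women ages 19 to 70 is 46 grams per day."]

query_emb = model.encode(queries, normalize_embeddings=True)
doc_emb = model.encode(documents, normalize_embeddings=True)
print(query_emb @ doc_emb.T)  # cosine similarity, since embeddings are normalized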

Alibaba-NLP/gte-Qwen2-1.5B-instruct

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 8960 1.8B 6.6 GB 2024-07-29 eng-Latn
Citation
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}

Alibaba-NLP/gte-Qwen2-7B-instruct

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 3584 7.6B 28.4 GB 2024-06-15 not specified
Citation
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}

BAAI/bge-base-en

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.0M 390.0 MB 2023-08-05 eng-Latn
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-base-en-v1.5

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.0M 390.0 MB 2023-09-11 eng-Latn
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-base-zh

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.0M 390.0 MB 2023-08-05 zho-Hans
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-base-zh-v1.5

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.0M 416.0 MB 2023-09-11 zho-Hans
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-large-en

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.0M 1.2 GB 2023-08-05 eng-Latn
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-large-en-v1.5

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.0M 1.2 GB 2023-09-12 eng-Latn
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-large-zh

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.0M 1.2 GB 2023-08-02 zho-Hans
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-large-zh-v1.5

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.0M 1.2 GB 2023-09-12 zho-Hans
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-small-en

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 512 33.4M 127.0 MB 2023-08-05 eng-Latn
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-small-en-v1.5

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 512 33.4M 127.0 MB 2023-09-12 eng-Latn
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-small-zh

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 512 33.4M 127.0 MB 2023-08-05 zho-Hans
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-small-zh-v1.5

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 512 33.4M 91.0 MB 2023-09-12 zho-Hans
Citation
@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
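
The BGE models above follow the standard sentence-transformers workflow; for retrieval, the English v1.5 cards additionally recommend a fixed instruction prefix on short queries, while passages are encoded unchanged. A minimal sketch (the prefix string is taken from the English model cards and is an assumption for other variants):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-base-en-v1.5")  # 768-dim, 512-token encoder

# Query-side instruction recommended by the English BGE cards for short queries.
prefix = "Represent this sentence for searching relevant passages: "
queries = [prefix + "what is deep learning"]
passages = ["Deep learning is a subset of machine learning based on multi-layer neural networks."]

q = model.encode(queries, normalize_embeddings=True)
p = model.encode(passages, normalize_embeddings=True)
print(q @ p.T)  # cosine similarity, since embeddings are normalized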

BMRetriever/BMRetriever-1B

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 2048 908.8M 3.4 GB 2024-04-29 eng-Latn
Citation
@inproceedings{xu-etal-2024-bmretriever,
    title = "{BMR}etriever: Tuning Large Language Models as Better Biomedical Text Retrievers",
    author = "Xu, Ran and Shi, Wenqi and Yu, Yue and Zhuang, Yuchen and Zhu, Yanqiao and Wang, May Dongmei and Ho, Joyce C. and Zhang, Chao and Yang, Carl",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = "November",
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    pages = "22234--22254",
    url = "https://aclanthology.org/2024.emnlp-main.1241/"
}

BMRetriever/BMRetriever-2B

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 2048 2.5B 9.3 GB 2024-04-29 eng-Latn
Citation
@inproceedings{xu-etal-2024-bmretriever,
    title = "{BMR}etriever: Tuning Large Language Models as Better Biomedical Text Retrievers",
    author = "Xu, Ran and Shi, Wenqi and Yu, Yue and Zhuang, Yuchen and Zhu, Yanqiao and Wang, May Dongmei and Ho, Joyce C. and Zhang, Chao and Yang, Carl",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = "November",
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    pages = "22234--22254",
    url = "https://aclanthology.org/2024.emnlp-main.1241/"
}

BMRetriever/BMRetriever-410M

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 1024 353.8M 1.3 GB 2024-04-29 eng-Latn
Citation
@inproceedings{xu-etal-2024-bmretriever,
    title = "{BMR}etriever: Tuning Large Language Models as Better Biomedical Text Retrievers",
    author = "Xu, Ran and Shi, Wenqi and Yu, Yue and Zhuang, Yuchen and Zhu, Yanqiao and Wang, May Dongmei and Ho, Joyce C. and Zhang, Chao and Yang, Carl",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = "November",
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    pages = "22234--22254",
    url = "https://aclanthology.org/2024.emnlp-main.1241/"
}

BMRetriever/BMRetriever-7B

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 26.5 GB 2024-04-29 eng-Latn
Citation
@inproceedings{xu-etal-2024-bmretriever,
    title = "{BMR}etriever: Tuning Large Language Models as Better Biomedical Text Retrievers",
    author = "Xu, Ran and Shi, Wenqi and Yu, Yue and Zhuang, Yuchen and Zhu, Yanqiao and Wang, May Dongmei and Ho, Joyce C. and Zhang, Chao and Yang, Carl",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = "November",
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    pages = "22234--22254",
    url = "https://aclanthology.org/2024.emnlp-main.1241/"
}

BeastyZ/e5-R-mistral-7b

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.2B 27.0 GB 2024-06-28 eng-Latn

ByteDance-Seed/Seed1.5-Embedding

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 2048 not specified not specified 2025-04-25 eng-Latn, zho-Hans

Bytedance/Seed1.6-embedding

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 2048 not specified not specified 2025-06-18 eng-Latn, zho-Hans

Bytedance/Seed1.6-embedding-1215

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 2048 not specified not specified 2025-12-15 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73)

Cohere/Cohere-embed-english-light-v3.0

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 not specified not specified 2023-11-02 eng-Latn

Cohere/Cohere-embed-english-v3.0

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 not specified not specified 2023-11-02 eng-Latn

Cohere/Cohere-embed-multilingual-light-v3.0

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 not specified not specified 2023-11-02 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111)

Cohere/Cohere-embed-multilingual-v3.0

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 512 not specified not specified 2023-11-02 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111)

GeoGPT-Research-Project/GeoEmbedding

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.2B 27.0 GB 2025-04-22 eng-Latn

GritLM/GritLM-7B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.2B 13.5 GB 2024-02-15 deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn
Citation
@misc{muennighoff2024generative,
      title={Generative Representational Instruction Tuning},
      author={Niklas Muennighoff and Hongjin Su and Liang Wang and Nan Yang and Furu Wei and Tao Yu and Amanpreet Singh and Douwe Kiela},
      year={2024},
      eprint={2402.09906},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

GritLM/GritLM-8x7B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 32768 57.9B 87.0 GB 2024-02-15 deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn
Citation
@misc{muennighoff2024generative,
      title={Generative Representational Instruction Tuning},
      author={Niklas Muennighoff and Hongjin Su and Liang Wang and Nan Yang and Furu Wei and Tao Yu and Amanpreet Singh and Douwe Kiela},
      year={2024},
      eprint={2402.09906},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 896 494.0M 1.8 GB 2024-10-23 eng-Latn, zho-Hans
Citation
@article{hu2025kalm,
  title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
  author={Hu, Xinshuo and Shan, Zifei and Zhao, Xinping and Sun, Zetian and Liu, Zhenyu and Li, Dongfang and Ye, Shaolin and Wei, Xinyuan and Chen, Qian and Hu, Baotian and others},
  journal={arXiv preprint arXiv:2501.01028},
  year={2025}
}

HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 896 494.0M 1.8 GB 2024-12-26 eng-Latn, zho-Hans
Citation
@article{hu2025kalm,
  title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
  author={Hu, Xinshuo and Shan, Zifei and Zhao, Xinping and Sun, Zetian and Liu, Zhenyu and Li, Dongfang and Ye, Shaolin and Wei, Xinyuan and Chen, Qian and Hu, Baotian and others},
  journal={arXiv preprint arXiv:2501.01028},
  year={2025}
}

HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 896 494.0M 942.0 MB 2025-06-25 eng-Latn, zho-Hans
Citation
@article{hu2025kalm,
  title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
  author={Hu, Xinshuo and Shan, Zifei and Zhao, Xinping and Sun, Zetian and Liu, Zhenyu and Li, Dongfang and Ye, Shaolin and Wei, Xinyuan and Chen, Qian and Hu, Baotian and others},
  journal={arXiv preprint arXiv:2501.01028},
  year={2025}
}

IEITYuan/Yuan-embedding-2.0-en

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 1024 595.8M 2.2 GB 2025-11-27 eng-Latn

KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 896 494.0M 1.8 GB 2025-09-30 eng-Latn, zho-Hans
Citation
@misc{zhao2025kalmembeddingv2,
      title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model}, 
      author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
      year={2025},
      eprint={2506.20923},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2506.20923}, 
}

@misc{hu2025kalmembedding,
      title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model}, 
      author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
      year={2025},
      eprint={2501.01028},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2501.01028}, 
}

Kingsoft-LLM/QZhou-Embedding

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 3584 7.1B 14.1 GB 2025-08-24 eng-Latn, zho-Hans
Citation
@misc{yu2025qzhouembeddingtechnicalreport,
      title={QZhou-Embedding Technical Report},
      author={Peng Yu and En Xu and Bin Chen and Haibiao Chen and Yinfei Xu},
      year={2025},
      eprint={2508.21632},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2508.21632},
}

Kingsoft-LLM/QZhou-Embedding-Zh

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1792 7.6B 28.7 GB 2025-09-28 zho-Hans
Citation
@misc{yu2025qzhouembeddingtechnicalreport,
      title={QZhou-Embedding Technical Report},
      author={Peng Yu and En Xu and Bin Chen and Haibiao Chen and Yinfei Xu},
      year={2025},
      eprint={2508.21632},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2508.21632},
}

Linq-AI-Research/Linq-Embed-Mistral

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 13.2 GB 2024-05-29 eng-Latn
Citation
@misc{LinqAIResearch2024,
  title={Linq-Embed-Mistral: Elevating Text Retrieval with Improved GPT Data Through Task-Specific Control and Quality Refinement},
  author={Junseong Kim and Seolhwa Lee and Jihoon Kwon and Sangmo Gu and Yejin Kim and Minkyung Cho and Jy-yong Sohn and Chanyeol Choi},
  howpublished={Linq AI Research Blog},
  year={2024},
  url={https://getlinq.com/blog/linq-embed-mistral/}
}

McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 26.5 GB 2024-04-09 eng-Latn
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
      title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
      author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
      year={2024},
      eprint={2404.05961},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2404.05961},
}

McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 26.5 GB 2024-04-09 eng-Latn
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
      title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
      author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
      year={2024},
      eprint={2404.05961},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2404.05961},
}

McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 4096 7.5B 28.0 GB 2024-04-09 eng-Latn
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
      title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
      author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
      year={2024},
      eprint={2404.05961},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2404.05961},
}

McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 4096 7.5B 28.0 GB 2024-04-09 eng-Latn
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
      title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
      author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
      year={2024},
      eprint={2404.05961},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2404.05961},
}

McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 26.5 GB 2024-04-09 eng-Latn
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
      title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
      author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
      year={2024},
      eprint={2404.05961},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2404.05961},
}

McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 26.5 GB 2024-04-09 eng-Latn
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
      title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
      author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
      year={2024},
      eprint={2404.05961},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2404.05961},
}

McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 26.5 GB 2024-04-09 eng-Latn
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
      title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
      author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
      year={2024},
      eprint={2404.05961},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2404.05961},
}

McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 26.5 GB 2024-04-09 eng-Latn
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
      title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
      author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
      year={2024},
      eprint={2404.05961},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2404.05961},
}

MongoDB/mdbr-leaf-ir

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 22.9M 86.0 MB 2025-08-27 eng-Latn
Citation
@misc{mdbr_leaf,
  title={LEAF: Knowledge Distillation of Text Embedding Models with Teacher-Aligned Representations},
  author={Robin Vujanic and Thomas Rueckstiess},
  year={2025},
  eprint={2509.12539},
  archivePrefix={arXiv},
  primaryClass={cs.IR},
  url={https://arxiv.org/abs/2509.12539}
}

MongoDB/mdbr-leaf-mt

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 23.0M 86.0 MB 2025-08-27 eng-Latn
Citation
@misc{mdbr_leaf,
  title={LEAF: Knowledge Distillation of Text Embedding Models with Teacher-Aligned Representations},
  author={Robin Vujanic and Thomas Rueckstiess},
  year={2025},
  eprint={2509.12539},
  archivePrefix={arXiv},
  primaryClass={cs.IR},
  url={https://arxiv.org/abs/2509.12539}
}

NovaSearch/jasper_en_vision_language_v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
131.1K 8960 2.0B 3.7 GB 2024-12-11 eng-Latn
Citation
@misc{zhang2025jasperstelladistillationsota,
      title={Jasper and Stella: distillation of SOTA embedding models},
      author={Dun Zhang and Jiacheng Li and Ziyang Zeng and Fulong Wang},
      year={2025},
      eprint={2412.19048},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2412.19048},
}

NovaSearch/stella_en_1.5B_v5

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
131.1K 8960 1.5B 5.7 GB 2024-07-12 eng-Latn
Citation
@misc{zhang2025jasperstelladistillationsota,
      title={Jasper and Stella: distillation of SOTA embedding models},
      author={Dun Zhang and Jiacheng Li and Ziyang Zeng and Fulong Wang},
      year={2025},
      eprint={2412.19048},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2412.19048},
}

NovaSearch/stella_en_400M_v5

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 4096 435.0M 1.6 GB 2024-07-12 eng-Latn
Citation
@misc{zhang2025jasperstelladistillationsota,
      title={Jasper and Stella: distillation of SOTA embedding models},
      author={Dun Zhang and Jiacheng Li and Ziyang Zeng and Fulong Wang},
      year={2025},
      eprint={2412.19048},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2412.19048},
}

Qwen/Qwen3-Embedding-0.6B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 1024 595.8M 1.1 GB 2025-06-05 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73)
Citation
@article{qwen3embedding,
  title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},
  author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},
  journal={arXiv preprint arXiv:2506.05176},
  year={2025}
}

Qwen/Qwen3-Embedding-4B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 2560 4.0B 7.5 GB 2025-06-05 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73)
Citation
@article{qwen3embedding,
  title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},
  author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},
  journal={arXiv preprint arXiv:2506.05176},
  year={2025}
}

Qwen/Qwen3-Embedding-8B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.6B 14.1 GB 2025-06-05 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73)
Citation
@article{qwen3embedding,
  title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},
  author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},
  journal={arXiv preprint arXiv:2506.05176},
  year={2025}
}
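
The Qwen3-Embedding checkpoints ship sentence-transformers prompt configurations, so the query-side instruction can be selected by name instead of being formatted by hand. A minimal sketch, assuming the "query" prompt name used on the model cards:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")  # smallest of the three variants

queries = ["What is the capital of China?"]
documents = ["The capital of China is Beijing."]

# prompt_name="query" selects the query-side instruction configured in the checkpoint.
q = model.encode(queries, prompt_name="query", normalize_embeddings=True)
d = model.encode(documents, normalize_embeddings=True)
print(q @ d.T)  # cosine similarity, since embeddings are normalized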

ReasonIR/ReasonIR-8B

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
131.1K 4096 7.5B not specified 2025-04-29 eng-Latn
Citation
@article{shao2025reasonir,
      title={ReasonIR: Training Retrievers for Reasoning Tasks},
      author={Rulin Shao and Rui Qiao and Varsha Kishore and Niklas Muennighoff and Xi Victoria Lin and Daniela Rus and Bryan Kian Hsiang Low and Sewon Min and Wen-tau Yih and Pang Wei Koh and Luke Zettlemoyer},
      year={2025},
      journal={arXiv preprint arXiv:2504.20595},
      url={https://arxiv.org/abs/2504.20595},
}

Sailesh97/Hinvec

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 2048 939.6M 3.6 GB 2025-06-19 eng-Latn, hin-Deva

Salesforce/SFR-Embedding-2_R

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 13.2 GB 2024-06-14 eng-Latn
Citation
@misc{SFR-embedding-2,
      title={SFR-Embedding-2: Advanced Text Embedding with Multi-stage Training},
      author={Rui Meng*, Ye Liu*, Shafiq Rayhan Joty, Caiming Xiong, Yingbo Zhou, Semih Yavuz},
      year={2024},
      url={https://huggingface.co/Salesforce/SFR-Embedding-2_R}
    }

Salesforce/SFR-Embedding-Code-2B_R

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 2304 2.6B 4.9 GB 2025-01-17 eng-Latn
Citation
@article{liu2024codexembed,
  title={CodeXEmbed: A Generalist Embedding Model Family for Multilingual and Multi-task Code Retrieval},
  author={Liu, Ye and Meng, Rui and Jot, Shafiq and Savarese, Silvio and Xiong, Caiming and Zhou, Yingbo and Yavuz, Semih},
  journal={arXiv preprint arXiv:2411.12644},
  year={2024}
}

Salesforce/SFR-Embedding-Mistral

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 13.2 GB 2024-01-24 eng-Latn
Citation
@misc{SFRAIResearch2024,
  title={SFR-Embedding-Mistral: Enhance Text Retrieval with Transfer Learning},
  author={Rui Meng, Ye Liu, Shafiq Rayhan Joty, Caiming Xiong, Yingbo Zhou, Semih Yavuz},
  howpublished={Salesforce AI Research Blog},
  year={2024},
  url={https://www.salesforce.com/blog/sfr-embedding/}
}

SamilPwC-AXNode-GenAI/PwC-Embedding_expr

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 560.0M 2.1 GB 2025-08-12 kor-Hang

Snowflake/snowflake-arctic-embed-l

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.0M 1.2 GB 2024-04-12 eng-Latn
Citation
@article{merrick2024embedding,
      title={Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
      author={Merrick, Luke},
      journal={arXiv preprint arXiv:2407.18887},
      year={2024},
      eprint={2407.18887},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2407.18887}
}

Snowflake/snowflake-arctic-embed-l-v2.0

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 568.0M 2.1 GB 2024-12-04 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (74)
Citation
@article{yu2024arctic,
      title={Arctic-Embed 2.0: Multilingual Retrieval Without Compromise},
      author={Yu, Puxuan and Merrick, Luke and Nuti, Gaurav and Campos, Daniel},
      journal={arXiv preprint arXiv:2412.04506},
      year={2024},
      eprint={2412.04506},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2412.04506}
}

Snowflake/snowflake-arctic-embed-m

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.0M 415.0 MB 2024-04-12 eng-Latn
Citation
@article{merrick2024embedding,
      title={Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
      author={Merrick, Luke},
      journal={arXiv preprint arXiv:2407.18887},
      year={2024},
      eprint={2407.18887},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2407.18887}
}

Snowflake/snowflake-arctic-embed-m-long

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 768 137.0M 522.0 MB 2024-04-12 eng-Latn
Citation
@article{merrick2024embedding,
      title={Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
      author={Merrick, Luke},
      journal={arXiv preprint arXiv:2407.18887},
      year={2024},
      eprint={2407.18887},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2407.18887}
}

Snowflake/snowflake-arctic-embed-m-v1.5

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.0M 415.0 MB 2024-07-08 eng-Latn
Citation
@article{merrick2024embedding,
      title={Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
      author={Merrick, Luke},
      journal={arXiv preprint arXiv:2407.18887},
      year={2024},
      eprint={2407.18887},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2407.18887}
}

Snowflake/snowflake-arctic-embed-m-v2.0

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 305.0M 1.1 GB 2024-12-04 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (74)
Citation
@article{yu2024arctic,
      title={Arctic-Embed 2.0: Multilingual Retrieval Without Compromise},
      author={Yu, Puxuan and Merrick, Luke and Nuti, Gaurav and Campos, Daniel},
      journal={arXiv preprint arXiv:2412.04506},
      year={2024},
      eprint={2412.04506},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2412.04506}
}

Snowflake/snowflake-arctic-embed-s

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 32.2M 127.0 MB 2024-04-12 eng-Latn
Citation
@article{merrick2024embedding,
      title={Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
      author={Merrick, Luke},
      journal={arXiv preprint arXiv:2407.18887},
      year={2024},
      eprint={2407.18887},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2407.18887}
}

Snowflake/snowflake-arctic-embed-xs

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 22.6M 86.0 MB 2024-07-08 eng-Latn
Citation
@article{merrick2024embedding,
      title={Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
      author={Merrick, Luke},
      journal={arXiv preprint arXiv:2407.18887},
      year={2024},
      eprint={2407.18887},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2407.18887}
}

Tarka-AIR/Tarka-Embedding-150M-V1

License: gemma

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 768 155.7M 576.0 MB 2025-11-04 arb-Arab, deu-Latn, eng-Latn, fra-Latn, jpn-Jpan, ... (8)
Citation
@misc{tarka_ai_research_2025,
    author       = { Tarka AI Research },
    title        = { Tarka-Embedding-150M-V1 (Revision c5f4f43) },
    year         = 2025,
    url          = { https://huggingface.co/Tarka-AIR/Tarka-Embedding-150M-V1 },
    doi          = { 10.57967/hf/6875 },
    publisher    = { Hugging Face }
}

Tarka-AIR/Tarka-Embedding-350M-V1

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
128.0K 1024 354.5M 676.0 MB 2025-11-11 arb-Arab, deu-Latn, eng-Latn, fra-Latn, jpn-Jpan, ... (8)
Citation
@misc{tarka_ai_research_2025,
    author       = { Tarka AI Research },
    title        = { Tarka-Embedding-350M-V1 (Revision f4b5de8) },
    year         = 2025,
    url          = { https://huggingface.co/Tarka-AIR/Tarka-Embedding-350M-V1 },
    doi          = { 10.57967/hf/6979 },
    publisher    = { Hugging Face }
}

TencentBAC/Conan-embedding-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 3584 not specified not specified 2025-04-10 eng-Latn, zho-Hans

VPLabs/SearchMap_Preview

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 4096 435.0M 1.6 GB 2025-03-05 eng-Latn
Citation
@misc{vectorpath2025searchmap,
  title={SearchMap: Conversational E-commerce Search Embedding Model},
  author={VectorPath Research Team},
  year={2025},
  publisher={Hugging Face},
  journal={HuggingFace Model Hub},
}

WhereIsAI/UAE-Large-V1

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.0M 1.2 GB 2023-12-04 eng-Latn
Citation
@article{li2023angle,
  title={AnglE-optimized Text Embeddings},
  author={Li, Xianming and Li, Jing},
  journal={arXiv preprint arXiv:2309.12871},
  year={2023}
}

ai-forever/FRIDA

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1536 823.0M 3.1 GB 2024-12-29 rus-Cyrl

ai-forever/ru-en-RoSBERTa

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 404.0M 1.5 GB 2024-07-29 rus-Cyrl
Citation
@misc{snegirev2024russianfocusedembeddersexplorationrumteb,
      title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design},
      author={Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov},
      year={2024},
      eprint={2408.12503},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2408.12503},
    }

ai-sage/Giga-Embeddings-instruct

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
4.1K 2048 3.2B 12.6 GB 2025-09-23 eng-Latn, rus-Cyrl

annamodels/LGAI-Embedding-Preview

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 26.5 GB 2025-06-11 eng-Latn
Citation
@misc{choi2025lgaiembeddingpreviewtechnicalreport,
      title={LGAI-EMBEDDING-Preview Technical Report},
      author={Jooyoung Choi and Hyun Kim and Hansol Jang and Changwook Jun and Kyunghoon Bae and Hyewon Choi and Stanley Jungkyu Choi and Honglak Lee and Chulmin Yun},
      year={2025},
      eprint={2506.07438},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2506.07438},
}

bedrock/cohere-embed-english-v3

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 not specified not specified 2023-11-02 eng-Latn

bedrock/cohere-embed-multilingual-v3

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 not specified not specified 2023-11-02 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111)

bflhc/MoD-Embedding

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 2560 4.0B 7.5 GB 2025-12-14 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73)
Citation
@misc{mod-embedding-2025,
  title={MoD-Embedding: A Fine-tuned Multilingual Text Embedding Model},
  author={MoD Team},
  year={2025},
  url={https://huggingface.co/bflhc/MoD-Embedding}
}

bflhc/Octen-Embedding-8B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.6B 14.1 GB 2025-12-23 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73)
Citation
@misc{octen-embedding-2025,
  title={Octen-Embedding-8B: A Fine-tuned Multilingual Text Embedding Model},
  author={Octen Team},
  year={2025},
  url={https://huggingface.co/bflhc/Octen-Embedding-8B}
}

castorini/repllama-v1-7b-lora-passage

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
4.1K 4096 7.0M 27.0 MB 2023-10-11 eng-Latn
Citation
@article{rankllama,
      title={Fine-Tuning LLaMA for Multi-Stage Text Retrieval},
      author={Xueguang Ma and Liang Wang and Nan Yang and Furu Wei and Jimmy Lin},
      year={2023},
      journal={arXiv:2310.08319},
}

cl-nagoya/ruri-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 111.2M 212.0 MB 2024-08-28 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

cl-nagoya/ruri-base-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 111.2M 424.0 MB 2024-12-05 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

cl-nagoya/ruri-large

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 337.4M 644.0 MB 2024-08-28 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

cl-nagoya/ruri-large-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 337.4M 1.3 GB 2024-12-06 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

cl-nagoya/ruri-small

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 68.1M 130.0 MB 2024-08-28 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

cl-nagoya/ruri-small-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 68.1M 260.0 MB 2024-12-05 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

cl-nagoya/ruri-v3-130m

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 512 132.1M 504.0 MB 2025-04-09 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

cl-nagoya/ruri-v3-30m

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 256 36.7M 140.0 MB 2025-04-07 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

cl-nagoya/ruri-v3-310m

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 314.6M 1.2 GB 2025-04-09 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

cl-nagoya/ruri-v3-70m

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 256 36.7M 140.0 MB 2025-04-09 jpn-Jpan
Citation
@misc{Ruri,
  title={{Ruri: Japanese General Text Embeddings}},
  author={Hayato Tsukagoshi and Ryohei Sasano},
  year={2024},
  eprint={2409.07737},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2409.07737},
}

clips/e5-base-trm-nl

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 124.4M 237.0 MB 2025-09-23 nld-Latn
Citation
@misc{banar2025mtebnle5nlembeddingbenchmark,
  archiveprefix = {arXiv},
  author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
  eprint = {2509.12340},
  primaryclass = {cs.CL},
  title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
  url = {https://arxiv.org/abs/2509.12340},
  year = {2025},
}

clips/e5-large-trm-nl

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 355.0M 1.3 GB 2025-09-23 nld-Latn
Citation
@misc{banar2025mtebnle5nlembeddingbenchmark,
  archiveprefix = {arXiv},
  author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
  eprint = {2509.12340},
  primaryclass = {cs.CL},
  title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
  url = {https://arxiv.org/abs/2509.12340},
  year = {2025},
}

clips/e5-small-trm-nl

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 40.8M 78.0 MB 2025-09-23 nld-Latn
Citation
@misc{banar2025mtebnle5nlembeddingbenchmark,
  archiveprefix = {arXiv},
  author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
  eprint = {2509.12340},
  primaryclass = {cs.CL},
  title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
  url = {https://arxiv.org/abs/2509.12340},
  year = {2025},
}

codefuse-ai/C2LLM-0.5B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 896 497.3M 948.0 MB 2025-12-22 eng-Latn, go-Code, java-Code, javascript-Code, php-Code, ... (8)

codefuse-ai/C2LLM-7B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 3584 7.7B 14.3 GB 2025-12-22 eng-Latn, go-Code, java-Code, javascript-Code, php-Code, ... (8)

codefuse-ai/F2LLM-0.6B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 595.8M 1.1 GB 2025-09-18 eng-Latn
Citation
@article{2025F2LLM,
    title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
    author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
    journal={CoRR},
    volume={abs/2510.02294},
    year={2025},
    url={https://doi.org/10.48550/arXiv.2510.02294},
    doi={10.48550/ARXIV.2510.02294},
    eprinttype={arXiv},
    eprint={2510.02294}
}

codefuse-ai/F2LLM-1.7B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 2560 1.7B 3.2 GB 2025-09-18 eng-Latn
Citation
@article{2025F2LLM,
    title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
    author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
    journal={CoRR},
    volume={abs/2510.02294},
    year={2025},
    url={https://doi.org/10.48550/arXiv.2510.02294},
    doi={10.48550/ARXIV.2510.02294},
    eprinttype={arXiv},
    eprint={2510.02294}
}

codefuse-ai/F2LLM-4B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 2560 4.0B 7.5 GB 2025-09-18 eng-Latn
Citation
@article{2025F2LLM,
    title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
    author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
    journal={CoRR},
    volume={abs/2510.02294},
    year={2025},
    url={https://doi.org/10.48550/arXiv.2510.02294},
    doi={10.48550/ARXIV.2510.02294},
    eprinttype={arXiv},
    eprint={2510.02294}
}

deepvk/USER-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 427.0M 473.0 MB 2024-06-10 rus-Cyrl
Citation
@misc{deepvk2024user,
        title={USER: Universal Sentence Encoder for Russian},
        author={Malashenko, Boris and Zemerov, Anton and Spirin, Egor},
        url={https://huggingface.co/datasets/deepvk/USER-base},
        publisher={Hugging Face},
        year={2024},
}

deepvk/USER2-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 149.0M 568.0 MB 2025-04-19 rus-Cyrl
Citation
@misc{deepvk2025user,
    title={USER2},
    author={Malashenko, Boris and Spirin, Egor and Sokolov, Andrey},
    url={https://huggingface.co/deepvk/USER2-base},
    publisher={Hugging Face},
    year={2025},
}

deepvk/USER2-small

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 384 34.4M 131.0 MB 2025-04-19 rus-Cyrl
Citation
@misc{deepvk2025user,
    title={USER2},
    author={Malashenko, Boris and Spirin, Egor and Sokolov, Andrey},
    url={https://huggingface.co/deepvk/USER2-small},
    publisher={Hugging Face},
    year={2025},
}

emillykkejensen/EmbeddingGemma-Scandi-300m

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 768 307.6M 578.0 MB 2025-10-17 dan-Latn, nno-Latn, nob-Latn, nor-Latn, swe-Latn
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

emillykkejensen/Qwen3-Embedding-Scandi-0.6B

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 1024 595.8M 2.2 GB 2025-10-17 dan-Latn, nno-Latn, nob-Latn, nor-Latn, swe-Latn

emillykkejensen/mmBERTscandi-base-embedding

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 306.9M 1.1 GB 2025-10-17 dan-Latn, nno-Latn, nob-Latn, nor-Latn, swe-Latn
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

fyaronskiy/english_code_retriever

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 149.0M 568.0 MB 2025-07-10 eng-Latn

google/embeddinggemma-300m

License: gemma

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 768 307.6M 1.1 GB 2025-09-04 arb-Arab, ben-Beng, deu-Latn, eng-Latn, fin-Latn, ... (19)
Citation
@misc{vera2025embeddinggemmapowerfullightweighttext,
      title={EmbeddingGemma: Powerful and Lightweight Text Representations},
      author={Henrique Schechter Vera and Sahil Dua and Biao Zhang and Daniel Salz and Ryan Mullins and Sindhu Raghuram Panyam and Sara Smoot and Iftekhar Naim and Joe Zou and Feiyang Chen and Daniel Cer and Alice Lisak and Min Choi and Lucas Gonzalez and Omar Sanseviero and Glenn Cameron and Ian Ballantyne and Kat Black and Kaifeng Chen and Weiyi Wang and Zhe Li and Gus Martins and Jinhyuk Lee and Mark Sherwood and Juyeong Ji and Renjie Wu and Jingxiao Zheng and Jyotinder Singh and Abheesht Sharma and Divyashree Sreepathihalli and Aashi Jain and Adham Elarabawy and AJ Co and Andreas Doumanoglou and Babak Samari and Ben Hora and Brian Potetz and Dahun Kim and Enrique Alfonseca and Fedor Moiseev and Feng Han and Frank Palma Gomez and Gustavo Hernández Ábrego and Hesen Zhang and Hui Hui and Jay Han and Karan Gill and Ke Chen and Koert Chen and Madhuri Shanbhogue and Michael Boratko and Paul Suganthan and Sai Meher Karthik Duddu and Sandeep Mariserla and Setareh Ariafar and Shanfeng Zhang and Shijie Zhang and Simon Baumgartner and Sonam Goenka and Steve Qiu and Tanmaya Dabral and Trevor Walker and Vikram Rao and Waleed Khawaja and Wenlei Zhou and Xiaoqi Ren and Ye Xia and Yichang Chen and Yi-Ting Chen and Zhe Dong and Zhongli Ding and Francesco Visin and Gaël Liu and Jiageng Zhang and Kathleen Kenealy and Michelle Casbon and Ravin Kumar and Thomas Mesnard and Zach Gleicher and Cormac Brick and Olivier Lacombe and Adam Roberts and Qin Yin and Yunhsuan Sung and Raphael Hoffmann and Tris Warkentin and Armand Joulin and Tom Duerig and Mojtaba Seyedhosseini},
      year={2025},
      eprint={2509.20354},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2509.20354},
}

google/gemini-embedding-001

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 3072 not specified not specified 2025-03-07 arb-Arab, ben-Beng, deu-Latn, eng-Latn, fin-Latn, ... (19)

google/text-embedding-004

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 768 not specified not specified 2024-05-14 eng-Latn

google/text-embedding-005

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 768 not specified not specified 2024-11-18 eng-Latn

google/text-multilingual-embedding-002

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 768 not specified not specified 2024-05-14 arb-Arab, ben-Beng, deu-Latn, eng-Latn, fin-Latn, ... (19)

infgrad/Jasper-Token-Compression-600M

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 2048 595.8M 2.2 GB 2025-11-14 eng-Latn, zho-Hans
Citation
@misc{zhang2025jaspertokencompression600mtechnicalreport,
      title={Jasper-Token-Compression-600M Technical Report},
      author={Dun Zhang and Ziyang Zeng and Yudong Zhou and Shuyang Lu},
      year={2025},
      eprint={2511.14405},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2511.14405},
}

infly/inf-retriever-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 3584 7.1B 13.2 GB 2024-12-24 eng-Latn, zho-Hans
Citation
@misc{infly-ai_2025,
  author       = {Junhan Yang and Jiahe Wan and Yichen Yao and Wei Chu and Yinghui Xu and Yuan Qi},
  title        = {inf-retriever-v1 (Revision 5f469d7)},
  year         = 2025,
  url          = {https://huggingface.co/infly/inf-retriever-v1},
  doi          = {10.57967/hf/4262},
  publisher    = {Hugging Face}
}

infly/inf-retriever-v1-1.5b

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 1536 1.5B 2.9 GB 2025-02-08 eng-Latn, zho-Hans
Citation
@misc{infly-ai_2025,
  author       = {Junhan Yang and Jiahe Wan and Yichen Yao and Wei Chu and Yinghui Xu and Yuan Qi},
  title        = {inf-retriever-v1 (Revision 5f469d7)},
  year         = 2025,
  url          = {https://huggingface.co/infly/inf-retriever-v1},
  doi          = {10.57967/hf/4262},
  publisher    = {Hugging Face}
}

intfloat/e5-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.0M 418.0 MB 2022-12-26 eng-Latn
Citation
@article{wang2022text,
  title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
  journal={arXiv preprint arXiv:2212.03533},
  year={2022}
}

intfloat/e5-base-v2

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.0M 418.0 MB 2024-02-08 eng-Latn
Citation
@article{wang2022text,
  title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
  journal={arXiv preprint arXiv:2212.03533},
  year={2022}
}

intfloat/e5-large

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.0M 1.2 GB 2022-12-26 eng-Latn
Citation
@article{wang2022text,
  title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
  journal={arXiv preprint arXiv:2212.03533},
  year={2022}
}

intfloat/e5-large-v2

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 335.0M 1.2 GB 2024-02-08 eng-Latn
Citation
@article{wang2022text,
  title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
  journal={arXiv preprint arXiv:2212.03533},
  year={2022}
}

intfloat/e5-mistral-7b-instruct

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 13.2 GB 2024-02-08 deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn
Citation
    @article{wang2023improving,
      title={Improving Text Embeddings with Large Language Models},
      author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
      journal={arXiv preprint arXiv:2401.00368},
      year={2023}
    }

    @article{wang2022text,
      title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
      author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
      journal={arXiv preprint arXiv:2212.03533},
      year={2022}
    }

intfloat/e5-small

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 33.0M 127.0 MB 2024-02-08 eng-Latn
Citation
@article{wang2022text,
  title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
  journal={arXiv preprint arXiv:2212.03533},
  year={2022}
}

intfloat/e5-small-v2

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 33.0M 127.0 MB 2024-02-08 eng-Latn
Citation
@article{wang2022text,
  title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
  journal={arXiv preprint arXiv:2212.03533},
  year={2022}
}

intfloat/multilingual-e5-base

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 278.0M 1.0 GB 2024-02-08 afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
@article{wang2024multilingual,
  title={Multilingual E5 Text Embeddings: A Technical Report},
  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
  journal={arXiv preprint arXiv:2402.05672},
  year={2024}
}

intfloat/multilingual-e5-large

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 560.0M 2.1 GB 2024-02-08 afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
@article{wang2024multilingual,
  title={Multilingual E5 Text Embeddings: A Technical Report},
  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
  journal={arXiv preprint arXiv:2402.05672},
  year={2024}
}

intfloat/multilingual-e5-large-instruct

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 560.0M 1.0 GB 2024-02-08 afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
@article{wang2024multilingual,
      title={Multilingual E5 Text Embeddings: A Technical Report},
      author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
      journal={arXiv preprint arXiv:2402.05672},
      year={2024}
    }

intfloat/multilingual-e5-small

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 118.0M 449.0 MB 2024-02-08 afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
@article{wang2024multilingual,
  title={Multilingual E5 Text Embeddings: A Technical Report},
  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
  journal={arXiv preprint arXiv:2402.05672},
  year={2024}
}

jinaai/jina-embeddings-v3

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 572.0M 1.1 GB 2024-09-18 afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
    @misc{sturua2024jinaembeddingsv3multilingualembeddingstask,
      title={jina-embeddings-v3: Multilingual Embeddings With Task LoRA},
      author={Saba Sturua and Isabelle Mohr and Mohammad Kalim Akram and Michael Günther and Bo Wang and Markus Krimmel and Feng Wang and Georgios Mastrapas and Andreas Koukounas and Andreas Koukounas and Nan Wang and Han Xiao},
      year={2024},
      eprint={2409.10173},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2409.10173},
    }

jxm/cde-small-v1

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 281.0M 1.0 GB 2024-09-24 eng-Latn
Citation
@misc{morris2024contextualdocumentembeddings,
    title={Contextual Document Embeddings},
    author={John X. Morris and Alexander M. Rush},
    year={2024},
    eprint={2410.02525},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2410.02525},
}

jxm/cde-small-v2

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 306.0M 1.1 GB 2025-01-13 eng-Latn
Citation
@misc{morris2024contextualdocumentembeddings,
    title={Contextual Document Embeddings},
    author={John X. Morris and Alexander M. Rush},
    year={2024},
    eprint={2410.02525},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2410.02525},
}

llamaindex/vdr-2b-multi-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 1536 2.0B 4.1 GB 2024-01-08 deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn

manveertamber/cadet-embed-base-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.0M 418.0 MB 2025-05-11 eng-Latn
Citation
@article{tamber2025conventionalcontrastivelearningfalls,
    title={Conventional Contrastive Learning Often Falls Short: Improving Dense Retrieval with Cross-Encoder Listwise Distillation and Synthetic Data},
    author={Manveer Singh Tamber and Suleman Kazi and Vivek Sourabh and Jimmy Lin},
    journal={arXiv:2505.19274},
    year={2025}
}

mixedbread-ai/mxbai-embed-2d-large-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 335.0M not specified 2024-03-04 eng-Latn

mixedbread-ai/mxbai-embed-large-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.0M 639.0 MB 2024-03-07 eng-Latn
Citation
    @online{emb2024mxbai,
      title={Open Source Strikes Bread - New Fluffy Embeddings Model},
      author={Sean Lee and Aamir Shakir and Darius Koenig and Julius Lipp},
      year={2024},
      url={https://www.mixedbread.ai/blog/mxbai-embed-large-v1},
    }

    @article{li2023angle,
      title={AnglE-optimized Text Embeddings},
      author={Li, Xianming and Li, Jing},
      journal={arXiv preprint arXiv:2309.12871},
      year={2023}
    }

mixedbread-ai/mxbai-embed-xsmall-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 24.1M not specified 2024-08-13 eng-Latn
Citation
@online{xsmall2024mxbai,
  title={Every Byte Matters: Introducing mxbai-embed-xsmall-v1},
  author={Sean Lee and Julius Lipp and Rui Huang and Darius Koenig},
  year={2024},
  url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
}

nomic-ai/modernbert-embed-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 149.0M 568.0 MB 2024-12-29 eng-Latn
Citation
@misc{nussbaum2024nomic,
      title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
      author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
      year={2024},
      eprint={2402.01613},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

nomic-ai/nomic-embed-text-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 not specified 522.0 MB 2024-01-31 eng-Latn
Citation
@misc{nussbaum2024nomic,
      title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
      author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
      year={2024},
      eprint={2402.01613},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

nomic-ai/nomic-embed-text-v1-ablated

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 not specified not specified 2024-01-15 eng-Latn

nomic-ai/nomic-embed-text-v1-unsupervised

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 not specified not specified 2024-01-15 eng-Latn

nomic-ai/nomic-embed-text-v1.5

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 137.0M 522.0 MB 2024-02-10 eng-Latn
Citation
@misc{nussbaum2024nomic,
      title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
      author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
      year={2024},
      eprint={2402.01613},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
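
nomic-embed-text-v1.5 was trained with Matryoshka representation learning and task prefixes, so the 768-dimensional output in the table can be truncated at load time and inputs need a prefix such as search_document: or search_query:. A minimal sketch with sentence-transformers; the 256-dimension truncation is only an example value:

from sentence_transformers import SentenceTransformer

# Minimal sketch: truncate the Matryoshka embedding to 256 dimensions and
# use the task prefixes expected by the nomic-embed models.
model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    trust_remote_code=True,
    truncate_dim=256,
)
docs = ["search_document: Text embeddings map sentences to dense vectors."]
queries = ["search_query: what is a text embedding?"]
print(model.encode(docs).shape, model.encode(queries).shape)  # (1, 256) each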

nomic-ai/nomic-embed-text-v2-moe

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 475.3M 1.8 GB 2025-02-07 amh-Ethi, arb-Arab, bel-Cyrl, ben-Beng, bul-Cyrl, ... (98)
Citation
@misc{nussbaum2025trainingsparsemixtureexperts,
      title={Training Sparse Mixture Of Experts Text Embedding Models},
      author={Zach Nussbaum and Brandon Duderstadt},
      year={2025},
      eprint={2502.07972},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2502.07972},
}

nvidia/NV-Embed-v1

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.8B 14.6 GB 2024-09-13 eng-Latn
Citation
@misc{moreira2025nvretrieverimprovingtextembedding,
      title={NV-Retriever: Improving text embedding models with effective hard-negative mining},
      author={Gabriel de Souza P. Moreira and Radek Osmulski and Mengyao Xu and Ronay Ak and Benedikt Schifferer and Even Oldridge},
      year={2025},
      eprint={2407.15831},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2407.15831}
}

nvidia/NV-Embed-v2

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.8B 14.6 GB 2024-09-09 eng-Latn
Citation
@misc{moreira2025nvretrieverimprovingtextembedding,
      title={NV-Retriever: Improving text embedding models with effective hard-negative mining},
      author={Gabriel de Souza P. Moreira and Radek Osmulski and Mengyao Xu and Ronay Ak and Benedikt Schifferer and Even Oldridge},
      year={2025},
      eprint={2407.15831},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2407.15831}
}

nvidia/llama-embed-nemotron-8b

License: https://huggingface.co/nvidia/llama-embed-nemotron-8b/blob/main/LICENSE

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.5B 28.0 GB 2025-10-23 afr-Latn, amh-Ethi, ara-Arab, arq-Arab, ary-Arab, ... (66)
Citation
@misc{moreira2025nvretrieverimprovingtextembedding,
      title={NV-Retriever: Improving text embedding models with effective hard-negative mining},
      author={Gabriel de Souza P. Moreira and Radek Osmulski and Mengyao Xu and Ronay Ak and Benedikt Schifferer and Even Oldridge},
      year={2025},
      eprint={2407.15831},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2407.15831}
}

opensearch-project/opensearch-neural-sparse-encoding-doc-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 30522 133.0M 507.0 MB 2024-03-07 eng-Latn

opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 30522 67.0M 267.0 MB 2024-07-17 eng-Latn

opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 30522 22.7M 86.0 MB 2024-07-18 eng-Latn

opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 30522 67.0M 267.0 MB 2025-03-28 eng-Latn

opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 30522 137.4M 549.0 MB 2025-06-18 eng-Latn

samaya-ai/RepLLaMA-reproduced

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
4.1K 4096 7.0M 27.0 MB 2024-09-15 eng-Latn
Citation
@article{rankllama,
      title={Fine-Tuning LLaMA for Multi-Stage Text Retrieval},
      author={Xueguang Ma and Liang Wang and Nan Yang and Furu Wei and Jimmy Lin},
      year={2023},
      journal={arXiv:2310.08319},
}

samaya-ai/promptriever-llama2-7b-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
4.1K 4096 7.0B 26.1 GB 2024-09-15 eng-Latn
Citation
@article{weller2024promptriever,
      title={Promptriever: Instruction-Trained Retrievers Can Be Prompted Like Language Models},
      author={Orion Weller and Benjamin Van Durme and Dawn Lawrie and Ashwin Paranjape and Yuhao Zhang and Jack Hessel},
      year={2024},
      eprint={2409.11136},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2409.11136},
}

samaya-ai/promptriever-llama3.1-8b-instruct-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 4096 8.0B 29.8 GB 2024-09-15 eng-Latn
Citation
@article{weller2024promptriever,
      title={Promptriever: Instruction-Trained Retrievers Can Be Prompted Like Language Models},
      author={Orion Weller and Benjamin Van Durme and Dawn Lawrie and Ashwin Paranjape and Yuhao Zhang and Jack Hessel},
      year={2024},
      eprint={2409.11136},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2409.11136},
}

samaya-ai/promptriever-llama3.1-8b-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 4096 8.0B 29.8 GB 2024-09-15 eng-Latn
Citation
@article{weller2024promptriever,
      title={Promptriever: Instruction-Trained Retrievers Can Be Prompted Like Language Models},
      author={Orion Weller and Benjamin Van Durme and Dawn Lawrie and Ashwin Paranjape and Yuhao Zhang and Jack Hessel},
      year={2024},
      eprint={2409.11136},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2409.11136},
}

samaya-ai/promptriever-mistral-v0.1-7b-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
4.1K 4096 7.0B 26.1 GB 2024-09-15 eng-Latn
Citation
@article{weller2024promptriever,
      title={Promptriever: Instruction-Trained Retrievers Can Be Prompted Like Language Models},
      author={Orion Weller and Benjamin Van Durme and Dawn Lawrie and Ashwin Paranjape and Yuhao Zhang and Jack Hessel},
      year={2024},
      eprint={2409.11136},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2409.11136},
}

sbintuitions/sarashina-embedding-v2-1b

License: https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b/blob/main/LICENSE

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1792 1.2B 4.6 GB 2025-07-30 jpn-Jpan

sergeyzh/BERTA

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 128.0M 489.0 MB 2025-03-10 rus-Cyrl

sergeyzh/rubert-mini-frida

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 312 32.3M 123.0 MB 2025-03-02 rus-Cyrl

tencent/KaLM-Embedding-Gemma3-12B-2511

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 3840 11.8B 43.8 GB 2025-11-06 not specified
Citation
@misc{zhao2025kalmembeddingv2,
      title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model}, 
      author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
      year={2025},
      eprint={2506.20923},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2506.20923}, 
}

@misc{hu2025kalmembedding,
      title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model}, 
      author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
      year={2025},
      eprint={2501.01028},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2501.01028}, 
}

tencent/Youtu-Embedding

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 2048 2.7B not specified 2025-09-28 zho-Hans
Citation
@misc{zhang2025codiemb,
  title={CoDiEmb: A Collaborative yet Distinct Framework for Unified Representation Learning in Information Retrieval and Semantic Textual Similarity},
  author={Zhang, Bowen and Song, Zixin and Chen, Chunquan and Zhang, Qian-Wen and Yin, Di and Sun, Xing},
  year={2025},
  eprint={2508.11442},
  archivePrefix={arXiv},
  url={https://arxiv.org/abs/2508.11442},
}

voyageai/voyage-2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
4.0K 1024 not specified not specified 2023-10-29 not specified

voyageai/voyage-3

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 1024 not specified not specified 2024-09-18 not specified

voyageai/voyage-3-large

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 1024 not specified not specified 2025-01-07 not specified

voyageai/voyage-3-lite

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 512 not specified not specified 2024-09-18 not specified

voyageai/voyage-3-m-exp

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 2048 6.9B not specified 2025-01-08 eng-Latn

voyageai/voyage-3.5

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 1024 not specified not specified 2025-01-21 not specified

voyageai/voyage-3.5 (output_dtype=binary)

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 1024 not specified not specified 2025-01-21 not specified

voyageai/voyage-3.5 (output_dtype=int8)

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 1024 not specified not specified 2025-01-21 not specified
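
The two voyage-3.5 variants above are the same hosted model queried with different output encodings. A minimal sketch with the voyageai Python client; the output_dtype argument is assumed to be exposed by the client as described in the vendor's API documentation:

import voyageai

# Minimal sketch (assumes VOYAGE_API_KEY is set and that the client passes
# output_dtype through to the embeddings endpoint, per the vendor docs).
vo = voyageai.Client()
result = vo.embed(
    ["Text embeddings map sentences to dense vectors."],
    model="voyage-3.5",
    input_type="document",
    output_dtype="int8",  # "binary" would request the bit-packed variant
)
print(len(result.embeddings[0]))  # 1024 values, per the table above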

voyageai/voyage-code-2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
16.0K 1536 not specified not specified 2024-01-23 not specified

voyageai/voyage-code-3

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 1024 not specified not specified 2024-12-04 not specified

voyageai/voyage-finance-2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 1024 not specified not specified 2024-05-30 not specified

voyageai/voyage-large-2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
16.0K 1536 not specified not specified 2023-10-29 not specified

voyageai/voyage-large-2-instruct

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
16.0K 1024 not specified not specified 2024-05-05 not specified

voyageai/voyage-law-2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
16.0K 1024 not specified not specified 2024-04-15 not specified

voyageai/voyage-multilingual-2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.0K 1024 not specified not specified 2024-06-10 not specified

yibinlei/LENS-d4000

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4000 7.1B 26.5 GB 2025-01-17 not specified
Citation
@article{lei2025lens,
  title={Enhancing Lexicon-Based Text Embeddings with Large Language Models},
  author={Lei, Yibin and Shen, Tao and Cao, Yu and Yates, Andrew},
  journal={arXiv preprint arXiv:2501.09749},
  year={2025}
}

yibinlei/LENS-d8000

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 8000 7.1B 26.5 GB 2025-01-17 not specified
Citation
@article{lei2025lens,
  title={Enhancing Lexicon-Based Text Embeddings with Large Language Models},
  author={Lei, Yibin and Shen, Tao and Cao, Yu and Yates, Andrew},
  journal={arXiv preprint arXiv:2501.09749},
  year={2025}
}

zeta-alpha-ai/Zeta-Alpha-E5-Mistral

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 13.2 GB 2024-08-30 eng-Latn
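
Most of the instruction models listed above (the E5-Mistral, GTE-Qwen, NV-Embed and Promptriever families, among others) expect a short task description to be prepended to the query side only, while documents are embedded as-is. A minimal sketch with sentence-transformers, assuming the "Instruct: {task}\nQuery: {text}" prompt format documented for the E5 instruct models; other models use their own templates, so check the individual model cards:

from sentence_transformers import SentenceTransformer

# Minimal sketch: the task description goes on the query side only; the
# prompt format below is the one documented for the E5 instruct family.
model = SentenceTransformer("intfloat/multilingual-e5-large-instruct")

task = "Given a web search query, retrieve relevant passages that answer the query"
queries = [f"Instruct: {task}\nQuery: how large is the e5 embedding dimension?"]
documents = ["multilingual-e5-large-instruct produces 1024-dimensional vectors."]

q_emb = model.encode(queries, normalize_embeddings=True)
d_emb = model.encode(documents, normalize_embeddings=True)
print(q_emb @ d_emb.T)  # cosine similarity, since the vectors are normalized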

Non-instruction Model

AITeamVN/Vietnamese_Embedding

License: cc-by-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 568.0M 2.1 GB 2024-03-17 vie-Latn
Citation
@misc{Vietnamese_Embedding,
  title={Vietnamese_Embedding: Embedding model in Vietnamese language.},
  author={Nguyen Nho Trung, Nguyen Nhat Quang, Nguyen Van Huy},
  year={2025},
  publisher={Huggingface},
}

Alibaba-NLP/gte-base-en-v1.5

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 137.0M not specified 2024-06-20 eng-Latn
Citation
@misc{zhang2024mgte,
  title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval}, 
  author={Xin Zhang and Yanzhao Zhang and Dingkun Long and Wen Xie and Ziqi Dai and Jialong Tang and Huan Lin and Baosong Yang and Pengjun Xie and Fei Huang and Meishan Zhang and Wenjie Li and Min Zhang},
  year={2024},
  eprint={2407.19669},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2407.19669}, 
}
@misc{li2023gte,
  title={Towards General Text Embeddings with Multi-stage Contrastive Learning}, 
  author={Zehan Li and Xin Zhang and Yanzhao Zhang and Dingkun Long and Pengjun Xie and Meishan Zhang},
  year={2023},
  eprint={2308.03281},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2308.03281}, 
}

Alibaba-NLP/gte-modernbert-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 149.0M 284.0 MB 2025-01-21 eng-Latn
Citation
@inproceedings{zhang2024mgte,
  title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
  author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
  booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
  pages={1393--1412},
  year={2024}
}

@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}

Alibaba-NLP/gte-multilingual-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 305.0M 582.0 MB 2024-07-20 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73)
Citation
@inproceedings{zhang2024mgte,
  title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
  author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
  booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
  pages={1393--1412},
  year={2024}
}

BAAI/bge-en-icl

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 4096 7.1B 26.5 GB 2024-07-25 eng-Latn
Citation
    @misc{li2024makingtextembeddersfewshot,
      title={Making Text Embedders Few-Shot Learners},
      author={Chaofan Li and MingHao Qin and Shitao Xiao and Jianlyu Chen and Kun Luo and Yingxia Shao and Defu Lian and Zheng Liu},
      year={2024},
      eprint={2409.15700},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2409.15700},
}

BAAI/bge-m3

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 568.0M 2.1 GB 2024-06-28 afr-Latn, amh-Ethi, ast-Latn, azj-Latn, azj-Latn, ... (29)
Citation
@misc{bge-m3,
      title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation}, 
      author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
      year={2024},
      eprint={2402.03216},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-m3-unsupervised

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 568.0M 2.1 GB 2024-01-30 afr-Latn, amh-Ethi, ast-Latn, azj-Latn, azj-Latn, ... (29)
Citation
@misc{bge-m3,
      title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation}, 
      author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
      year={2024},
      eprint={2402.03216},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-multilingual-gemma2

License: https://ai.google.dev/gemma/terms

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 3584 9.2B 34.4 GB 2024-07-25 eng-Latn, fra-Latn, jpn-Jpan, jpn-Latn, kor-Hang, ... (7)
Citation
@misc{bge-m3,
      title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation}, 
      author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
      year={2024},
      eprint={2402.03216},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}


@misc{bge_embedding,
      title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, 
      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
      year={2023},
      eprint={2309.07597},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

BAAI/bge-reranker-v2-m3

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified 2.1 GB 2024-06-24 ara-Arab, ben-Beng, dan-Latn, deu-Latn, eng-Latn, ... (32)
Citation
    @misc{li2023making,
      title={Making Large Language Models A Better Foundation For Dense Retrieval},
      author={Chaofan Li and Zheng Liu and Shitao Xiao and Yingxia Shao},
      year={2023},
      eprint={2312.15503},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
    }
    @misc{chen2024bge,
          title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
          author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
          year={2024},
          eprint={2402.03216},
          archivePrefix={arXiv},
          primaryClass={cs.CL}
    }

ByteDance/ListConRanker

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 401.0M 1.2 GB 2024-12-11 zho-Hans
Citation
@article{liu2025listconranker,
  title={ListConRanker: A Contrastive Text Reranker with Listwise Encoding},
  author={Liu, Junlong and Ma, Yue and Zhao, Ruihui and Zheng, Junhao and Ma, Qianli and Kang, Yangyang},
  journal={arXiv preprint arXiv:2501.07111},
  year={2025}
}

Classical/Yinka

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 326.0M 1.2 GB 2024-01-09 zho-Hans

DMetaSoul/Dmeta-embedding-zh-small

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
1.0K 768 74.2M 283.0 MB 2024-03-25 zho-Hans

DMetaSoul/sbert-chinese-general-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 128 not specified not specified 2022-03-25 zho-Hans

DeepPavlov/distilrubert-small-cased-conversational

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 107.0M 408.0 MB 2022-06-28 rus-Cyrl
Citation
@misc{https://doi.org/10.48550/arxiv.2205.02340,
      doi = {10.48550/ARXIV.2205.02340},
      url = {https://arxiv.org/abs/2205.02340},
      author = {Kolesnikova, Alina and Kuratov, Yuri and Konovalov, Vasily and Burtsev, Mikhail},
      keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
      title = {Knowledge Distillation of Russian Language Models with Reduction of Vocabulary},
      publisher = {arXiv},
      year = {2022},
      copyright = {arXiv.org perpetual, non-exclusive license}
    }

DeepPavlov/rubert-base-cased

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 1.3B 4.8 GB 2020-03-04 rus-Cyrl
Citation
@misc{kuratov2019adaptationdeepbidirectionalmultilingual,
      title={Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language},
      author={Yuri Kuratov and Mikhail Arkhipov},
      year={2019},
      eprint={1905.07213},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/1905.07213},
    }

DeepPavlov/rubert-base-cased-sentence

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 107.0M 408.0 MB 2020-03-04 rus-Cyrl

FacebookAI/xlm-roberta-base

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 278.0M 1.0 GB 2019-11-05 afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
@article{DBLP:journals/corr/abs-1911-02116,
  author    = {Alexis Conneau and
               Kartikay Khandelwal and
               Naman Goyal and
               Vishrav Chaudhary and
               Guillaume Wenzek and
               Francisco Guzm{\'{a}}n and
               Edouard Grave and
               Myle Ott and
               Luke Zettlemoyer and
               Veselin Stoyanov},
  title     = {Unsupervised Cross-lingual Representation Learning at Scale},
  journal   = {CoRR},
  volume    = {abs/1911.02116},
  year      = {2019},
  url       = {http://arxiv.org/abs/1911.02116},
  eprinttype = {arXiv},
  eprint    = {1911.02116},
  timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

FacebookAI/xlm-roberta-large

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 559.9M 2.1 GB 2019-11-05 afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
@article{DBLP:journals/corr/abs-1911-02116,
  author    = {Alexis Conneau and
               Kartikay Khandelwal and
               Naman Goyal and
               Vishrav Chaudhary and
               Guillaume Wenzek and
               Francisco Guzm{\'{a}}n and
               Edouard Grave and
               Myle Ott and
               Luke Zettlemoyer and
               Veselin Stoyanov},
  title     = {Unsupervised Cross-lingual Representation Learning at Scale},
  journal   = {CoRR},
  volume    = {abs/1911.02116},
  year      = {2019},
  url       = {http://arxiv.org/abs/1911.02116},
  eprinttype = {arXiv},
  eprint    = {1911.02116},
  timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

Gameselo/STS-multilingual-mpnet-base-v2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 278.0M 1.0 GB 2024-06-07 not specified
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

GreenNode/GreenNode-Embedding-Large-VN-Mixed-V1

License: cc-by-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 568.0M 2.1 GB 2024-04-11 vie-Latn

GreenNode/GreenNode-Embedding-Large-VN-V1

License: cc-by-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 568.0M 2.1 GB 2024-04-11 vie-Latn

HIT-TMG/KaLM-embedding-multilingual-mini-v1

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 896 494.0M 1.8 GB 2024-08-27 eng-Latn, zho-Hans
Citation
@article{hu2025kalm,
  title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
  author={Hu, Xinshuo and Shan, Zifei and Zhao, Xinping and Sun, Zetian and Liu, Zhenyu and Li, Dongfang and Ye, Shaolin and Wei, Xinyuan and Chen, Qian and Hu, Baotian and others},
  journal={arXiv preprint arXiv:2501.01028},
  year={2025}
}

Haon-Chen/speed-embedding-7b-instruct

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K not specified 7.1B 13.2 GB 2024-10-31 eng-Latn
Citation
@article{chen2024little,
    title={Little Giants: Synthesizing High-Quality Embedding Data at Scale},
    author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
    journal={arXiv preprint arXiv:2410.18634},
    year={2024}
}

HooshvareLab/bert-base-parsbert-uncased

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 162.8M 621.0 MB 2021-05-19 fas-Arab
Citation
    @article{ParsBERT,
    title={ParsBERT: Transformer-based Model for Persian Language Understanding},
    author={Mehrdad Farahani, Mohammad Gharachorloo, Marzieh Farahani, Mohammad Manthouri},
    journal={ArXiv},
    year={2020},
    volume={abs/2005.12515}
}

Hum-Works/lodestone-base-4096-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 768 not specified not specified 2023-08-25 eng-Latn

Human

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified not specified ara-Arab, dan-Latn, eng-Latn, nob-Latn, rus-Cyrl

IEITYuan/Yuan-embedding-2.0-zh

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1792 326.0M 1.2 GB 2025-11-24 zho-Hans

Jaume/gemma-2b-embeddings

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 2048 2.5B 9.3 GB 2024-06-29 not specified

KBLab/sentence-bert-swedish-cased

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
384 768 124.7M 476.0 MB 2023-01-11 swe-Latn
Citation
@misc{rekathati2021introducing,  
  author = {Rekathati, Faton},  
  title = {The KBLab Blog: Introducing a Swedish Sentence Transformer},  
  url = {https://kb-labb.github.io/posts/2021-08-23-a-swedish-sentence-transformer/},  
  year = {2021}  
}

KFST/XLMRoberta-en-da-sv-nb

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 278.0M 1.0 GB 2022-02-22 dan-Latn, eng-Latn, nno-Latn, nob-Latn, swe-Latn

KennethEnevoldsen/dfm-sentence-encoder-large

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 355.1M 1.5 GB 2023-07-12 dan-Latn
Citation
@article{enevoldsenScandinavianEmbeddingBenchmarks2024,
    title = {The {Scandinavian} {Embedding} {Benchmarks}: {Comprehensive} {Assessment} of {Multilingual} and {Monolingual} {Text} {Embedding}},
    shorttitle = {The {Scandinavian} {Embedding} {Benchmarks}},
    url = {https://openreview.net/forum?id=pJl_i7HIA72},
    language = {en},
    urldate = {2024-04-12},
    author = {Enevoldsen, Kenneth and Kardos, Márton and Muennighoff, Niklas and Nielbo, Kristoffer},
    month = feb,
    year = {2024},
}

KennethEnevoldsen/dfm-sentence-encoder-medium

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 124.4M 475.0 MB 2023-07-12 dan-Latn
Citation
@article{enevoldsenScandinavianEmbeddingBenchmarks2024,
    title = {The {Scandinavian} {Embedding} {Benchmarks}: {Comprehensive} {Assessment} of {Multilingual} and {Monolingual} {Text} {Embedding}},
    shorttitle = {The {Scandinavian} {Embedding} {Benchmarks}},
    url = {https://openreview.net/forum?id=pJl_i7HIA72},
    language = {en},
    urldate = {2024-04-12},
    author = {Enevoldsen, Kenneth and Kardos, Márton and Muennighoff, Niklas and Nielbo, Kristoffer},
    month = feb,
    year = {2024},
}

Kowshik24/bangla-sentence-transformer-ft-matryoshka-paraphrase-multilingual-mpnet-base-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
128 768 278.0M 1.0 GB 2025-11-10 ben-Beng
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

Lajavaness/bilingual-embedding-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 278.0M 1.0 GB 2024-06-26 not specified
Citation
    @article{conneau2019unsupervised,
  title={Unsupervised cross-lingual representation learning at scale},
  author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
  journal={arXiv preprint arXiv:1911.02116},
  year={2019}
}

@article{reimers2019sentence,
   title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
   author={Nils Reimers, Iryna Gurevych},
   journal={https://arxiv.org/abs/1908.10084},
   year={2019}
}

@article{thakur2020augmented,
  title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
  author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
  journal={arXiv e-prints},
  pages={arXiv--2010},
  year={2020}
}

Lajavaness/bilingual-embedding-large

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 559.9M 2.1 GB 2024-06-24 eng-Latn, fra-Latn
Citation
    @article{conneau2019unsupervised,
  title={Unsupervised cross-lingual representation learning at scale},
  author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
  journal={arXiv preprint arXiv:1911.02116},
  year={2019}
}

@article{reimers2019sentence,
   title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
   author={Nils Reimers, Iryna Gurevych},
   journal={https://arxiv.org/abs/1908.10084},
   year={2019}
}

@article{thakur2020augmented,
  title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
  author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
  journal={arXiv e-prints},
  pages={arXiv--2010},
  year={2020}
}

Lajavaness/bilingual-embedding-small

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 117.7M 449.0 MB 2024-07-17 eng-Latn, fra-Latn
Citation
    @article{conneau2019unsupervised,
  title={Unsupervised cross-lingual representation learning at scale},
  author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
  journal={arXiv preprint arXiv:1911.02116},
  year={2019}
}

@article{reimers2019sentence,
   title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
   author={Nils Reimers, Iryna Gurevych},
   journal={https://arxiv.org/abs/1908.10084},
   year={2019}
}

@article{thakur2020augmented,
  title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
  author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
  journal={arXiv e-prints},
  pages={arXiv--2010},
  year={2020}
}

MCINext/Hakim

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 124.4M 475.0 MB 2025-05-10 fas-Arab
Citation
@article{sarmadi2025hakim,
      title={Hakim: Farsi Text Embedding Model},
      author={Sarmadi, Mehran and Alikhani, Morteza and Zinvandi, Erfan and Pourbahman, Zahra},
      journal={arXiv preprint arXiv:2505.08435},
      year={2025}
}

MCINext/Hakim-small

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 512 38.7M 148.0 MB 2025-05-10 fas-Arab
Citation
@article{sarmadi2025hakim,
      title={Hakim: Farsi Text Embedding Model},
      author={Sarmadi, Mehran and Alikhani, Morteza and Zinvandi, Erfan and Pourbahman, Zahra},
      journal={arXiv preprint arXiv:2505.08435},
      year={2025}
}

MCINext/Hakim-unsup

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 124.4M 475.0 MB 2025-05-10 fas-Arab
Citation
@article{sarmadi2025hakim,
      title={Hakim: Farsi Text Embedding Model},
      author={Sarmadi, Mehran and Alikhani, Morteza and Zinvandi, Erfan and Pourbahman, Zahra},
      journal={arXiv preprint arXiv:2505.08435},
      year={2025}
}

Mihaiii/Bulbasaur

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 17.4M 66.0 MB 2024-04-27 not specified

Mihaiii/Ivysaur

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 22.7M 87.0 MB 2024-04-27 not specified

Mihaiii/Squirtle

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 15.6M 60.0 MB 2024-04-30 not specified

Mihaiii/Venusaur

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 15.6M 60.0 MB 2024-04-29 not specified

Mihaiii/Wartortle

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 17.4M 66.0 MB 2024-04-30 not specified

Mihaiii/gte-micro

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 17.4M 66.0 MB 2024-04-21 not specified

Mihaiii/gte-micro-v4

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 19.2M 73.0 MB 2024-04-22 not specified

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
1.5K 4096 8.0B 15.3 GB 2025-11-06 eng-Latn
Citation
@misc{euler2025legal,
      title={Euler-Legal-Embedding: Advanced Legal Representation Learning}, 
      author={LawRank Team},
      year={2025},
      publisher={Hugging Face}
}

NbAiLab/nb-bert-base

License: cc-by-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 177.9M 681.0 MB 2021-01-13 nno-Latn, nob-Latn

NbAiLab/nb-bert-large

License: cc-by-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 355.1M 1.3 GB 2021-04-29 nno-Latn, nob-Latn

NbAiLab/nb-sbert-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
75 4096 1.8B 678.0 MB 2022-11-23 dan-Latn, nno-Latn, nob-Latn, swe-Latn

NeuML/pubmedbert-base-embeddings-100K

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 64 100.0K 0.0 MB 2025-01-03 eng-Latn

NeuML/pubmedbert-base-embeddings-1M

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 64 1.0M 2.0 MB 2025-01-03 eng-Latn

NeuML/pubmedbert-base-embeddings-2M

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 64 1.9M 7.0 MB 2025-01-03 eng-Latn

NeuML/pubmedbert-base-embeddings-500K

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 64 500.0K 2.0 MB 2025-01-03 eng-Latn

NeuML/pubmedbert-base-embeddings-8M

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 7.8M 30.0 MB 2025-01-03 eng-Latn

Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 135.2M 516.0 MB 2024-06-16 ara-Arab
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 117.7M 449.0 MB 2024-06-25 ara-Arab

Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
768 768 135.0M 516.0 MB 2024-07-28 ara-Arab
Citation
    @article{nacar2025gate,
    title={GATE: General Arabic Text Embedding for Enhanced Semantic Textual Similarity with Matryoshka Representation Learning and Hybrid Loss Training},
    author={Nacar, Omer and Koubaa, Anis and Sibaee, Serry and Al-Habashi, Yasser and Ammar, Adel and Boulila, Wadii},
    journal={arXiv preprint arXiv:2505.24581},
    year={2025}
}

Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 278.0M 1.0 GB 2024-06-14 ara-Arab
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

Omartificial-Intelligence-Space/Arabic-labse-Matryoshka

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 470.9M 1.8 GB 2024-06-16 ara-Arab
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 109.5M 418.0 MB 2024-06-15 ara-Arab
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 162.8M 621.0 MB 2024-06-17 ara-Arab
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

OpenSearch-AI/Ops-MoA-Conan-embedding-v1

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1536 343.0M 1.3 GB 2025-03-26 zho-Hans

OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1536 343.0M 1.2 GB 2025-03-26 zho-Hans

OrdalieTech/Solon-embeddings-large-0.1

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 559.9M 2.1 GB 2023-12-09 fra-Latn

OrdalieTech/Solon-embeddings-mini-beta-1.1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 210.0M 808.0 MB 2025-01-01 fra-Latn

OrlikB/KartonBERT-USE-base-v1

License: gpl-3.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 103.7M 396.0 MB 2024-09-30 pol-Latn

OrlikB/st-polish-kartonberta-base-alpha-v1

License: lgpl

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 not specified not specified 2023-11-12 pol-Latn

PartAI/Tooka-SBERT

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 353.0M 1.3 GB 2024-12-07 fas-Arab
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

PartAI/Tooka-SBERT-V2-Large

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 353.0M 1.3 GB 2025-05-01 fas-Arab
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

PartAI/Tooka-SBERT-V2-Small

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 122.9M 496.0 MB 2025-05-01 fas-Arab
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

PartAI/TookaBERT-Base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 122.9M 469.0 MB 2024-12-08 fas-Arab

Qodo/Qodo-Embed-1-1.5B

License: https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 1536 1.8B 6.6 GB 2025-02-19 c#-Code, c++-Code, go-Code, java-Code, javascript-Code, ... (9)

Qodo/Qodo-Embed-1-7B

License: https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 3584 7.6B 28.4 GB 2025-02-24 c#-Code, c++-Code, go-Code, java-Code, javascript-Code, ... (9)

Shuu12121/CodeSearch-ModernBERT-Crow-Plus

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
1.0K 768 151.7M 607.0 MB 2025-04-21 eng-Latn

TencentBAC/Conan-embedding-v1

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 326.0M 1.2 GB 2024-08-22 zho-Hans
Citation
@misc{li2024conanembeddinggeneraltextembedding,
  title={Conan-embedding: General Text Embedding with More and Better Negative Samples}, 
  author={Shiyu Li and Yang Tang and Shizhe Chen and Xi Chen},
  year={2024},
  eprint={2408.15710},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2408.15710}, 
}

VoVanPhuc/sup-SimCSE-VietNamese-phobert-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
256 768 135.0M 517.0 MB 2021-05-26 vie-Latn
Citation
@article{gao2021simcse,
   title={{SimCSE}: Simple Contrastive Learning of Sentence Embeddings},
   author={Gao, Tianyu and Yao, Xingcheng and Chen, Danqi},
   journal={arXiv preprint arXiv:2104.08821},
   year={2021}
}

@inproceedings{phobert,
title     = {{PhoBERT: Pre-trained language models for Vietnamese}},
author    = {Dat Quoc Nguyen and Anh Tuan Nguyen},
booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
year      = {2020},
pages     = {1037--1042}
}

aari1995/German_Semantic_STS_V2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.7M 1.3 GB 2022-11-17 deu-Latn

abhinand/MedEmbed-small-v0.1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 33.4M 127.0 MB 2024-10-20 eng-Latn
Citation
@software{balachandran2024medembed,
  author = {Balachandran, Abhinand},
  title = {MedEmbed: Medical-Focused Embedding Models},
  year = {2024},
  url = {https://github.com/abhinand5/MedEmbed}
}

ai-forever/sbert_large_mt_nlu_ru

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 427.0M 1.6 GB 2021-05-18 rus-Cyrl

ai-forever/sbert_large_nlu_ru

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 427.0M 1.6 GB 2020-11-20 rus-Cyrl

amazon/Titan-text-embeddings-v2

License: https://aws.amazon.com/service-terms/

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2024-04-30 eng-Latn

andersborges/model2vecdk

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 48.0M 183.0 MB 2025-11-21 dan-Latn
Citation
@article{minishlab2024model2vec,
  author = {Tulkens, Stephan and {van Dongen}, Thomas},
  title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
  year = {2024},
  url = {https://github.com/MinishLab/model2vec}
}

andersborges/model2vecdk-stem

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 48.6M 185.0 MB 2025-11-21 dan-Latn
Citation
@article{minishlab2024model2vec,
  author = {Tulkens, Stephan and {van Dongen}, Thomas},
  title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
  year = {2024},
  url = {https://github.com/MinishLab/model2vec}
}

avsolatorio/GIST-Embedding-v0

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.5M 418.0 MB 2024-01-31 eng-Latn
Citation
@article{solatorio2024gistembed,
    title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
    author={Aivin V. Solatorio},
    journal={arXiv preprint arXiv:2402.16829},
    year={2024},
    URL={https://arxiv.org/abs/2402.16829},
    eprint={2402.16829},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

avsolatorio/GIST-all-MiniLM-L6-v2

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 22.7M 87.0 MB 2024-02-03 eng-Latn
Citation
@article{solatorio2024gistembed,
    title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
    author={Aivin V. Solatorio},
    journal={arXiv preprint arXiv:2402.16829},
    year={2024},
    URL={https://arxiv.org/abs/2402.16829},
    eprint={2402.16829},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

avsolatorio/GIST-large-Embedding-v0

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.1M 1.2 GB 2024-02-14 eng-Latn
Citation
@article{solatorio2024gistembed,
    title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
    author={Aivin V. Solatorio},
    journal={arXiv preprint arXiv:2402.16829},
    year={2024},
    URL={https://arxiv.org/abs/2402.16829},
    eprint={2402.16829},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

avsolatorio/GIST-small-Embedding-v0

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 33.4M 127.0 MB 2024-02-03 eng-Latn
Citation
@article{solatorio2024gistembed,
    title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
    author={Aivin V. Solatorio},
    journal={arXiv preprint arXiv:2402.16829},
    year={2024},
    URL={https://arxiv.org/abs/2402.16829},
    eprint={2402.16829},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

avsolatorio/NoInstruct-small-Embedding-v0

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 33.4M 127.0 MB 2024-05-01 eng-Latn

bedrock/amazon-titan-embed-text-v1

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1536 not specified not specified 2023-09-27 not specified

bedrock/amazon-titan-embed-text-v2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 not specified not specified 2024-04-30 not specified

bigscience/sgpt-bloom-7b1-msmarco

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 4096 not specified not specified 2022-08-26 not specified
Citation
@article{muennighoff2022sgpt,
  title={SGPT: GPT Sentence Embeddings for Semantic Search},
  author={Muennighoff, Niklas},
  journal={arXiv preprint arXiv:2202.08904},
  year={2022}
}

bisectgroup/BiCA-base

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 110.0M 418.0 MB 2025-11-14 eng-Latn
Citation
@misc{sinha2025bicaeffectivebiomedicaldense,
      title={BiCA: Effective Biomedical Dense Retrieval with Citation-Aware Hard Negatives}, 
      author={Aarush Sinha and Pavan Kumar S and Roshan Balaji and Nirav Pravinbhai Bhatt},
      year={2025},
      eprint={2511.08029},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2511.08029}, 
}

bkai-foundation-models/vietnamese-bi-encoder

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
256 768 135.0M 515.0 MB 2023-09-09 vie-Latn
Citation
@article{duc2024towards,
    title={Towards Comprehensive Vietnamese Retrieval-Augmented Generation and Large Language Models},
    author={Nguyen Quang Duc and Le Hai Son and Nguyen Duc Nhan and Nguyen Dich Nhat Minh and Le Thanh Huong and Dinh Viet Sang},
    journal={arXiv preprint arXiv:2403.01616},
    year={2024}
}

bm25s

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2024-07-10 eng-Latn
Citation
@misc{bm25s,
      title={BM25S: Orders of magnitude faster lexical search via eager sparse scoring},
      author={Xing Han Lù},
      year={2024},
      eprint={2407.03618},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2407.03618},
}
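
bm25s is a lexical (sparse) scorer rather than a dense embedder, so it is used through its own library. Below is a minimal sketch of the index-then-retrieve workflow on a toy two-document corpus; the documents and query are illustrative.

# Minimal sketch of bm25s usage on a toy corpus.
import bm25s

corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the human's best friend and loves to play",
]

retriever = bm25s.BM25()
retriever.index(bm25s.tokenize(corpus))          # build the sparse index

results, scores = retriever.retrieve(
    bm25s.tokenize("does the cat purr?"), k=2    # top-k lexical matches
)
print(results, scores)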

brahmairesearch/slx-v0.1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 22.7M 87.0 MB 2024-08-13 eng-Latn

castorini/monobert-large-msmarco

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2020-05-28 eng-Latn

castorini/monot5-3b-msmarco-10k

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2022-03-28 eng-Latn
Citation
@misc{rosa2022parameterleftbehinddistillation,
      title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
      author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
      year={2022},
      eprint={2206.02873},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2206.02873},
    }

castorini/monot5-base-msmarco-10k

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2022-03-28 eng-Latn
Citation
@misc{rosa2022parameterleftbehinddistillation,
      title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
      author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
      year={2022},
      eprint={2206.02873},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2206.02873},
    }

castorini/monot5-large-msmarco-10k

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2022-03-28 eng-Latn
Citation
@misc{rosa2022parameterleftbehinddistillation,
      title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
      author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
      year={2022},
      eprint={2206.02873},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2206.02873},
    }

castorini/monot5-small-msmarco-10k

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2022-03-28 eng-Latn
Citation
@misc{rosa2022parameterleftbehinddistillation,
      title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
      author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
      year={2022},
      eprint={2206.02873},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2206.02873},
    }

codesage/codesage-base-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 1024 356.0M 1.3 GB 2024-02-03 go-Code, java-Code, javascript-Code, php-Code, python-Code, ... (6)
Citation
@inproceedings{
    zhang2024code,
    title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
    author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
    booktitle={The Twelfth International Conference on Learning Representations},
    year={2024},
    url={https://openreview.net/forum?id=vfzRRjumpX}
}

codesage/codesage-large-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 2048 1.3B 4.8 GB 2024-02-03 go-Code, java-Code, javascript-Code, php-Code, python-Code, ... (6)
Citation
@inproceedings{
    zhang2024code,
    title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
    author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
    booktitle={The Twelfth International Conference on Learning Representations},
    year={2024},
    url={https://openreview.net/forum?id=vfzRRjumpX}
}

codesage/codesage-small-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 1024 130.0M 496.0 MB 2024-02-03 go-Code, java-Code, javascript-Code, php-Code, python-Code, ... (6)
Citation
@inproceedings{
    zhang2024code,
    title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
    author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
    booktitle={The Twelfth International Conference on Learning Representations},
    year={2024},
    url={https://openreview.net/forum?id=vfzRRjumpX}
}

cointegrated/LaBSE-en-ru

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 129.0M 492.0 MB 2021-06-10 rus-Cyrl

cointegrated/rubert-tiny

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 312 11.9M 45.0 MB 2021-05-24 rus-Cyrl

cointegrated/rubert-tiny2

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 312 29.4M 112.0 MB 2021-10-28 rus-Cyrl

colbert-ir/colbertv2.0

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
180 not specified 110.0M 418.0 MB 2024-09-21 eng-Latn
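
ColBERT-style models score a query against a document with late interaction (MaxSim) rather than a single-vector dot product: each query token embedding is matched to its most similar document token embedding, and those maxima are summed. The NumPy sketch below illustrates that scoring rule on random stand-in token embeddings.

# MaxSim late-interaction scoring as used by ColBERT-style retrievers:
# score(q, d) = sum over query tokens of max over document tokens of the
# token-level similarity. Token embeddings here are random stand-ins.
import numpy as np

rng = np.random.default_rng(0)
query_tokens = rng.normal(size=(5, 128))   # 5 query tokens, 128-dim each
doc_tokens = rng.normal(size=(40, 128))    # 40 document tokens

# L2-normalise so the dot products are cosine similarities.
query_tokens /= np.linalg.norm(query_tokens, axis=1, keepdims=True)
doc_tokens /= np.linalg.norm(doc_tokens, axis=1, keepdims=True)

sim = query_tokens @ doc_tokens.T          # (5, 40) token-to-token similarities
score = sim.max(axis=1).sum()              # best doc token per query token, summed
print(score)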

consciousAI/cai-lunaris-text-embeddings

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 not specified not specified 2023-06-22 not specified

consciousAI/cai-stellaris-text-embeddings

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 not specified not specified 2023-06-23 not specified

deepfile/embedder-100p

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 not specified 1.0 GB 2023-07-24 not specified

deepvk/USER-bge-m3

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 359.0M 1.3 GB 2024-07-05 rus-Cyrl
Citation
@misc{deepvk2024user,
    title={USER: Universal Sentence Encoder for Russian},
    author={Malashenko, Boris and  Zemerov, Anton and Spirin, Egor},
    url={https://huggingface.co/datasets/deepvk/USER-base},
    publisher={Hugging Face},
    year={2024},
}

deepvk/deberta-v1-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 124.0M 473.0 MB 2023-02-07 rus-Cyrl

dmedhi/PawanEmbd-68M

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 68.0M 260.0 MB 2025-12-08 eng-Latn
Citation
@misc{medhi2025pawanembd,
    title={PawanEmbd-68M: Distilled Embedding Model},
    author={Medhi, D.},
    year={2025},
    url={https://huggingface.co/dmedhi/PawanEmbd-68M}
}

dunzhang/stella-large-zh-v3-1792d

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1792 not specified not specified 2024-02-17 zho-Hans

dunzhang/stella-mrl-large-zh-v3.5-1792d

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1792 326.0M 1.2 GB 2024-02-27 zho-Hans

dwzhu/e5-base-4k

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
4.1K not specified not specified not specified 2024-03-28 eng-Latn
Citation
@article{zhu2024longembed,
  title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
  author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian},
  journal={arXiv preprint arXiv:2404.12096},
  year={2024}
}

facebook/SONAR

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 not specified not specified 2021-05-21 ace-Arab, ace-Latn, acm-Arab, acq-Arab, aeb-Arab, ... (204)
Citation
@misc{Duquenne:2023:sonar_arxiv,
  author = {Paul-Ambroise Duquenne and Holger Schwenk and Benoit Sagot},
  title = {{SONAR:} Sentence-Level Multimodal and Language-Agnostic Representations},
  publisher = {arXiv},
  year = {2023},
  url = {https://arxiv.org/abs/2308.11466},
}

facebook/contriever-msmarco

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 150.0M 572.0 MB 2022-06-25 eng-Latn
Citation
    @misc{izacard2021contriever,
      title={Unsupervised Dense Information Retrieval with Contrastive Learning},
      author={Gautier Izacard and Mathilde Caron and Lucas Hosseini and Sebastian Riedel and Piotr Bojanowski and Armand Joulin and Edouard Grave},
      year={2021},
      url = {https://arxiv.org/abs/2112.09118},
      doi = {10.48550/ARXIV.2112.09118},
    }

fangxq/XYZ-embedding

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 326.0M 1.2 GB 2024-09-13 zho-Hans

google/flan-t5-base

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified 944.0 MB 2022-10-21 eng-Latn
Citation
@misc{10.48550/arxiv.2210.11416,
      doi = {10.48550/ARXIV.2210.11416},
      url = {https://arxiv.org/abs/2210.11416},
      author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
      keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
      title = {Scaling Instruction-Finetuned Language Models},
      publisher = {arXiv},
      year = {2022},
      copyright = {Creative Commons Attribution 4.0 International}
    }

google/flan-t5-large

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified 2.9 GB 2022-10-21 eng-Latn
Citation
@misc{10.48550/arxiv.2210.11416,
      doi = {10.48550/ARXIV.2210.11416},
      url = {https://arxiv.org/abs/2210.11416},
      author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
      keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
      title = {Scaling Instruction-Finetuned Language Models},
      publisher = {arXiv},
      year = {2022},
      copyright = {Creative Commons Attribution 4.0 International}
    }

google/flan-t5-xl

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified 10.6 GB 2022-10-21 eng-Latn
Citation
@misc{10.48550/arxiv.2210.11416,
      doi = {10.48550/ARXIV.2210.11416},
      url = {https://arxiv.org/abs/2210.11416},
      author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
      keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
      title = {Scaling Instruction-Finetuned Language Models},
      publisher = {arXiv},
      year = {2022},
      copyright = {Creative Commons Attribution 4.0 International}
    }

google/flan-t5-xxl

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified 42.0 GB 2022-10-21 eng-Latn
Citation
@misc{10.48550/arxiv.2210.11416,
      doi = {10.48550/ARXIV.2210.11416},
      url = {https://arxiv.org/abs/2210.11416},
      author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
      keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
      title = {Scaling Instruction-Finetuned Language Models},
      publisher = {arXiv},
      year = {2022},
      copyright = {Creative Commons Attribution 4.0 International}
    }

hiieu/halong_embedding

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 278.0M 1.0 GB 2024-07-06 vie-Latn
Citation
@misc{HalongEmbedding,
  title={HalongEmbedding: A Vietnamese Text Embedding},
  author={Ngo Hieu},
  year={2024},
  publisher={Huggingface},
}

iampanda/zpoint_large_embedding_zh

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1792 326.0M 1.2 GB 2024-06-04 zho-Hans

ibm-granite/granite-embedding-107m-multilingual

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 107.0M 204.0 MB 2024-12-18 ara-Latn, ces-Latn, deu-Latn, eng-Latn, fra-Latn, ... (13)
Citation
@article{awasthy2025graniteembedding,
  title={Granite Embedding Models},
  author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
  journal={arXiv preprint arXiv:2502.20204},
  year={2025}
}

ibm-granite/granite-embedding-125m-english

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 125.0M 238.0 MB 2024-12-18 eng-Latn
Citation
@article{awasthy2025graniteembedding,
  title={Granite Embedding Models},
  author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
  journal={arXiv preprint arXiv:2502.20204},
  year={2025}
}

ibm-granite/granite-embedding-278m-multilingual

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 278.0M 530.0 MB 2024-12-18 ara-Latn, ces-Latn, deu-Latn, eng-Latn, fra-Latn, ... (13)
Citation
@article{awasthy2025graniteembedding,
  title={Granite Embedding Models},
  author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
  journal={arXiv preprint arXiv:2502.20204},
  year={2025}
}

ibm-granite/granite-embedding-30m-english

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 30.0M 58.0 MB 2024-12-18 eng-Latn
Citation
@article{awasthy2025graniteembedding,
  title={Granite Embedding Models},
  author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
  journal={arXiv preprint arXiv:2502.20204},
  year={2025}
}

ibm-granite/granite-embedding-english-r2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 149.0M 284.0 MB 2025-08-15 eng-Latn
Citation
@article{awasthy2025graniteembedding,
  title={Granite Embedding Models},
  author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
  journal={arXiv preprint arXiv:2502.20204},
  year={2025}
}

ibm-granite/granite-embedding-small-english-r2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 384 47.0M 91.0 MB 2025-08-15 eng-Latn
Citation
@article{awasthy2025graniteembedding,
  title={Granite Embedding Models},
  author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
  journal={arXiv preprint arXiv:2502.20204},
  year={2025}
}
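
Most of the dense models on this page follow the same encode-and-compare pattern. The sketch below shows that pattern with a Sentence Transformers-compatible checkpoint; the model choice, query, and passages are illustrative assumptions, not content from this page.

# Minimal sketch: encode a query and two illustrative passages, then rank
# the passages by cosine similarity.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("ibm-granite/granite-embedding-small-english-r2")

query_emb = model.encode("How do I reset my password?")
passage_embs = model.encode([
    "To reset your password, open the account settings page.",
    "Our office is closed on public holidays.",
])

scores = util.cos_sim(query_emb, passage_embs)  # shape (1, 2)
print(scores)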

infgrad/stella-base-en-v2

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 not specified not specified not specified 2023-10-19 eng-Latn

infgrad/stella-base-zh-v3-1792d

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1792 not specified not specified 2024-02-17 zho-Hans

izhx/udever-bloom-1b1

License: https://huggingface.co/spaces/bigscience/license

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2023-10-24 aka-Latn, ara-Arab, asm-Beng, bam-Latn, ben-Beng, ... (45)
Citation
@article{zhang2023language,
  title={Language Models are Universal Embedders},
  author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
  journal={arXiv preprint arXiv:2310.08232},
  year={2023}
}

izhx/udever-bloom-3b

License: https://huggingface.co/spaces/bigscience/license

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2023-10-24 aka-Latn, ara-Arab, asm-Beng, bam-Latn, ben-Beng, ... (45)
Citation
@article{zhang2023language,
  title={Language Models are Universal Embedders},
  author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
  journal={arXiv preprint arXiv:2310.08232},
  year={2023}
}

izhx/udever-bloom-560m

License: https://huggingface.co/spaces/bigscience/license

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2023-10-24 aka-Latn, ara-Arab, asm-Beng, bam-Latn, ben-Beng, ... (45)
Citation
@article{zhang2023language,
  title={Language Models are Universal Embedders},
  author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
  journal={arXiv preprint arXiv:2310.08232},
  year={2023}
}

izhx/udever-bloom-7b1

License: https://huggingface.co/spaces/bigscience/license

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2023-10-24 aka-Latn, ara-Arab, asm-Beng, bam-Latn, ben-Beng, ... (45)
Citation
@article{zhang2023language,
  title={Language Models are Universal Embedders},
  author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
  journal={arXiv preprint arXiv:2310.08232},
  year={2023}
}

jhu-clsp/FollowIR-7B

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified 13.5 GB 2024-04-29 eng-Latn
Citation
    @misc{weller2024followir,
      title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions},
      author={Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini},
      year={2024},
      eprint={2403.15246},
      archivePrefix={arXiv},
      primaryClass={cs.IR}
    }

jinaai/jina-colbert-v2

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K not specified 559.0M 1.0 GB 2024-08-16 ara-Arab, ben-Beng, deu-Latn, eng-Latn, fas-Arab, ... (22)
Citation
@inproceedings{xiao-etal-2024-jina,
    title = "{J}ina-{C}ol{BERT}-v2: A General-Purpose Multilingual Late Interaction Retriever",
    author = {Jha, Rohan  and
      Wang, Bo  and
      G{"u}nther, Michael  and
      Mastrapas, Georgios  and
      Sturua, Saba  and
      Mohr, Isabelle  and
      Koukounas, Andreas  and
      Wang, Mohammad Kalim  and
      Wang, Nan  and
      Xiao, Han},
    editor = {S{"a}lev{"a}, Jonne  and
      Owodunni, Abraham},
    booktitle = "Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024)",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.mrl-1.11/",
    doi = "10.18653/v1/2024.mrl-1.11",
    pages = "159--166",
    abstract = "Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT`s late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context window and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that the reducing the embedding dimensionality from 128 to 64 has insignificant impact on the model`s retrieval performance and cut storage requirements by up to 50{\%}. Our new model, Jina-ColBERT-v2, demonstrates strong performance across a range of English and multilingual retrieval tasks,"
}

jinaai/jina-embedding-b-en-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 110.0M 420.0 MB 2023-07-07 eng-Latn
Citation
@misc{günther2023jina,
      title={Jina Embeddings: A Novel Set of High-Performance Sentence Embedding Models},
      author={Michael Günther and Louis Milliken and Jonathan Geuter and Georgios Mastrapas and Bo Wang and Han Xiao},
      year={2023},
      eprint={2307.11224},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

jinaai/jina-embedding-s-en-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 512 35.0M 134.0 MB 2023-07-07 eng-Latn
Citation
@misc{günther2023jina,
      title={Jina Embeddings: A Novel Set of High-Performance Sentence Embedding Models},
      author={Michael Günther and Louis Milliken and Jonathan Geuter and Georgios Mastrapas and Bo Wang and Han Xiao},
      year={2023},
      eprint={2307.11224},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

jinaai/jina-embeddings-v2-base-en

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 768 137.0M 262.0 MB 2023-09-27 eng-Latn
Citation
@misc{günther2023jina,
      title={Jina Embeddings 2: 8192-Token General-Purpose Text Embeddings for Long Documents},
      author={Michael Günther and Jackmin Ong and Isabelle Mohr and Alaeddine Abdessalem and Tanguy Abel and Mohammad Kalim Akram and Susana Guzman and Georgios Mastrapas and Saba Sturua and Bo Wang and Maximilian Werk and Nan Wang and Han Xiao},
      year={2023},
      eprint={2310.19923},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
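
The Jina v2 checkpoints ship custom modeling code for their long 8192-token context, so loading them through Sentence Transformers typically requires trusting that remote code. A minimal sketch under that assumption, with an illustrative input:

# Minimal sketch, assuming the checkpoint loads through Sentence Transformers
# with trust_remote_code=True. The input text is illustrative.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "jinaai/jina-embeddings-v2-base-en",
    trust_remote_code=True,
)
embedding = model.encode("A long report of up to 8192 tokens can be embedded in one pass.")
print(embedding.shape)  # (768,)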

jinaai/jina-embeddings-v2-small-en

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 512 32.7M 62.0 MB 2023-09-27 eng-Latn
Citation
@misc{günther2023jina,
      title={Jina Embeddings 2: 8192-Token General-Purpose Text Embeddings for Long Documents},
      author={Michael Günther and Jackmin Ong and Isabelle Mohr and Alaeddine Abdessalem and Tanguy Abel and Mohammad Kalim Akram and Susana Guzman and Georgios Mastrapas and Saba Sturua and Bo Wang and Maximilian Werk and Nan Wang and Han Xiao},
      year={2023},
      eprint={2310.19923},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

jinaai/jina-reranker-v2-base-multilingual

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified 531.0 MB 2024-09-26 eng-Latn

jinaai/jina-reranker-v3

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
131.1K not specified 600.0M 1.1 GB 2025-09-18 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73)
Citation
@misc{wang2025jinarerankerv3lateinteractionlistwise,
      title={jina-reranker-v3: Last but Not Late Interaction for Listwise Document Reranking},
      author={Feng Wang and Yuqing Li and Han Xiao},
      year={2025},
      eprint={2509.25085},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2509.25085},
}

keeeeenw/MicroLlama-text-embedding

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 1024 272.0M 1.0 GB 2024-11-10 eng-Latn

lier007/xiaobu-embedding

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 326.0M 1.2 GB 2024-01-09 zho-Hans

lier007/xiaobu-embedding-v2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 326.0M 1.2 GB 2024-06-30 zho-Hans

lightonai/GTE-ModernColBERT-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K not specified 149.0M not specified 2025-04-30 eng-Latn
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084"
}

llmrails/ember-v1

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.0M 1.2 GB 2023-10-10 eng-Latn
Citation
@misc{nur2024emberv1,
      title={ember-v1: SOTA embedding model}, 
      author={Enrike Nur and Anar Aliyev},
      year={2023},
}

m3hrdadfi/bert-zwnj-wnli-mean-tokens

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 118.3M 451.0 MB 2021-06-28 fas-Arab

m3hrdadfi/roberta-zwnj-wnli-mean-tokens

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 118.3M 451.0 MB 2021-06-28 fas-Arab

malenia1/ternary-weight-embedding

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 98.7M 158.0 MB 2024-10-23 not specified

manu/bge-m3-custom-fr

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1024 567.8M 2.1 GB 2024-04-11 not specified

manu/sentence_croissant_alpha_v0.2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 2048 1.3B 2.4 GB 2024-03-15 not specified

manu/sentence_croissant_alpha_v0.3

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 2048 1.3B 2.4 GB 2024-04-26 not specified

manu/sentence_croissant_alpha_v0.4

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 2048 1.3B 2.4 GB 2024-04-27 eng-Latn, fra-Latn

meta-llama/Llama-2-7b-chat-hf

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2023-07-18 eng-Latn
Citation
@misc{touvron2023llama2openfoundation,
      title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
      author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
      year={2023},
      eprint={2307.09288},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2307.09288},
    }

meta-llama/Llama-2-7b-hf

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2023-07-18 eng-Latn
Citation
@misc{touvron2023llama2openfoundation,
      title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
      author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
      year={2023},
      eprint={2307.09288},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2307.09288},
    }

minishlab/M2V_base_glove

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 102.0M 391.0 MB 2024-09-21 eng-Latn
Citation
@software{minishlab2024model2vec,
      author = {Stephan Tulkens and Thomas van Dongen},
      title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
      year = {2024},
      url = {https://github.com/MinishLab/model2vec}
}

minishlab/M2V_base_glove_subword

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 103.0M 391.0 MB 2024-09-21 eng-Latn
Citation
@software{minishlab2024model2vec,
      author = {Stephan Tulkens and Thomas van Dongen},
      title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
      year = {2024},
      url = {https://github.com/MinishLab/model2vec}
}

minishlab/M2V_base_output

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 7.6M 29.0 MB 2024-09-21 eng-Latn
Citation
@software{minishlab2024model2vec,
      author = {Stephan Tulkens and Thomas van Dongen},
      title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
      year = {2024},
      url = {https://github.com/MinishLab/model2vec}
}

minishlab/M2V_multilingual_output

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 128.0M 489.0 MB 2024-09-21 eng-Latn
Citation
@software{minishlab2024model2vec,
      author = {Stephan Tulkens and Thomas van Dongen},
      title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
      year = {2024},
      url = {https://github.com/MinishLab/model2vec}
}

minishlab/potion-base-2M

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 64 2.0M 7.0 MB 2024-10-29 eng-Latn
Citation
@software{minishlab2024model2vec,
      author = {Stephan Tulkens and Thomas van Dongen},
      title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
      year = {2024},
      url = {https://github.com/MinishLab/model2vec}
}

minishlab/potion-base-4M

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 128 3.8M 14.0 MB 2024-10-29 eng-Latn
Citation
@software{minishlab2024model2vec,
      author = {Stephan Tulkens and Thomas van Dongen},
      title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
      year = {2024},
      url = {https://github.com/MinishLab/model2vec}
}

minishlab/potion-base-8M

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 7.6M 29.0 MB 2024-10-29 eng-Latn
Citation
@software{minishlab2024model2vec,
      author = {Stephan Tulkens and Thomas van Dongen},
      title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
      year = {2024},
      url = {https://github.com/MinishLab/model2vec}
}
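
The Model2Vec/potion checkpoints are static (non-contextual) embedders and are typically loaded through the model2vec library rather than a transformer runtime. A minimal sketch under that assumption, with illustrative sentences:

# Minimal sketch: loading a Model2Vec static embedding model and encoding
# two illustrative sentences. Assumes the model2vec package is installed.
from model2vec import StaticModel

model = StaticModel.from_pretrained("minishlab/potion-base-8M")
embeddings = model.encode(["It's dangerous to go alone!", "Take this."])
print(embeddings.shape)  # (2, 256) for this 256-dimensional model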

minishlab/potion-multilingual-128M

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 128.0M 489.0 MB 2025-05-23 afr-Latn, amh-Ethi, ara-Arab, aze-Latn, bel-Cyrl, ... (101)
Citation
@software{minishlab2024model2vec,
      author = {Stephan Tulkens and Thomas van Dongen},
      title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
      year = {2024},
      url = {https://github.com/MinishLab/model2vec}
}

mistralai/Mistral-7B-Instruct-v0.2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2023-12-11 eng-Latn
Citation
@misc{jiang2023mistral7b,
      title={Mistral 7B},
      author={Albert Q. Jiang and Alexandre Sablayrolles and Arthur Mensch and Chris Bamford and Devendra Singh Chaplot and Diego de las Casas and Florian Bressand and Gianna Lengyel and Guillaume Lample and Lucile Saulnier and Lélio Renard Lavaud and Marie-Anne Lachaux and Pierre Stock and Teven Le Scao and Thibaut Lavril and Thomas Wang and Timothée Lacroix and William El Sayed},
      year={2023},
      eprint={2310.06825},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2310.06825},
    }

moka-ai/m3e-base

License: https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 102.0M 390.0 MB 2023-06-06 eng-Latn, zho-Hans
Citation
@software{MokaMassiveMixedEmbedding,
  author = {Wang Yuxin and Sun Qingxuan and He Sicheng},
  title = {M3E: Moka Massive Mixed Embedding Model},
  year = {2023}
}

moka-ai/m3e-large

License: https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 not specified not specified 2023-06-21 eng-Latn, zho-Hans
Citation
@software{MokaMassiveMixedEmbedding,
  author = {Wang Yuxin and Sun Qingxuan and He Sicheng},
  title = {M3E: Moka Massive Mixed Embedding Model},
  year = {2023}
}

moka-ai/m3e-small

License: https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 512 not specified not specified 2023-06-02 eng-Latn, zho-Hans
Citation
@software{MokaMassiveMixedEmbedding,
  author = {Wang Yuxin and Sun Qingxuan and He Sicheng},
  title = {M3E: Moka Massive Mixed Embedding Model},
  year = {2023}
}

myrkur/sentence-transformer-parsbert-fa

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 162.8M 621.0 MB 2024-12-10 fas-Arab

omarelshehy/arabic-english-sts-matryoshka

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 559.9M 2.1 GB 2024-10-13 ara-Arab, eng-Latn
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

openai/text-embedding-3-large

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 3072 not specified not specified 2024-01-25 not specified

openai/text-embedding-3-large (embed_dim=512)

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 512 not specified not specified 2024-01-25 not specified

openai/text-embedding-3-small

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1536 not specified not specified 2024-01-25 not specified

openai/text-embedding-3-small (embed_dim=512)

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 512 not specified not specified 2024-01-25 not specified
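
The "(embed_dim=512)" rows correspond to requesting a shortened vector from the same API model via the dimensions parameter. A minimal sketch with the official Python client follows; the client setup and input text are illustrative, and an OPENAI_API_KEY environment variable is assumed.

# Minimal sketch: requesting 512-dimensional vectors from text-embedding-3-small
# via the dimensions parameter of the embeddings endpoint.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="A short passage to embed.",
    dimensions=512,  # shorten the native 1536-dim output to 512 dimensions
)
print(len(response.data[0].embedding))  # 512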

openai/text-embedding-ada-002

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1536 not specified not specified 2022-12-15 not specified

openbmb/MiniCPM-Embedding

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 2304 2.7B 5.1 GB 2024-09-04 eng-Latn, zho-Hans

panalexeu/xlm-roberta-ua-distilled

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 278.0M 1.0 GB 2025-04-15 eng-Latn, ukr-Cyrl
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

prdev/mini-gte

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 66.3M 253.0 MB 2025-01-28 eng-Latn

rasgaard/m2v-dfm-large

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 22.9M 87.0 MB 2025-10-08 dan-Latn
Citation
@article{minishlab2024model2vec,
    author = {Tulkens, Stephan and {van Dongen}, Thomas},
    title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
    year = {2024},
    url = {https://github.com/MinishLab/model2vec}
}

richinfoai/ritrieve_zh_v1

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1792 326.0M 1.2 GB 2025-03-25 zho-Hans

sbintuitions/sarashina-embedding-v1-1b

License: https://huggingface.co/sbintuitions/sarashina-embedding-v1-1b/blob/main/LICENSE

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
8.2K 1792 1.2B 4.6 GB 2024-11-22 jpn-Jpan

sbunlp/fabert

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 124.4M 475.0 MB 2024-10-07 fas-Arab
Citation
@inproceedings{masumi-etal-2025-fabert,
    title = "{F}a{BERT}: Pre-training {BERT} on {P}ersian Blogs",
    author = "Masumi, Mostafa  and
      Majd, Seyed Soroush  and
      Shamsfard, Mehrnoush  and
      Beigy, Hamid",
    editor = "Bak, JinYeong  and
      Goot, Rob van der  and
      Jang, Hyeju  and
      Buaphet, Weerayut  and
      Ramponi, Alan  and
      Xu, Wei  and
      Ritter, Alan",
    booktitle = "Proceedings of the Tenth Workshop on Noisy and User-generated Text",
    month = may,
    year = "2025",
    address = "Albuquerque, New Mexico, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.wnut-1.10/",
    doi = "10.18653/v1/2025.wnut-1.10",
    pages = "85--96",
    ISBN = "979-8-89176-232-9",
}

sdadas/mmlw-e5-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 278.0M 1.0 GB 2023-11-17 pol-Latn
Citation
@article{dadas2024pirb,
  title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods}, 
  author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
  year={2024},
  eprint={2402.13350},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

sdadas/mmlw-e5-large

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 559.9M 2.1 GB 2023-11-17 pol-Latn
Citation
@article{dadas2024pirb,
  title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods}, 
  author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
  year={2024},
  eprint={2402.13350},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

sdadas/mmlw-e5-small

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 117.7M 449.0 MB 2023-11-17 pol-Latn
Citation
@article{dadas2024pirb,
  title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods}, 
  author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
  year={2024},
  eprint={2402.13350},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

sdadas/mmlw-roberta-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 768 124.4M 475.0 MB 2023-11-17 pol-Latn
Citation
@article{dadas2024pirb,
  title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods}, 
  author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
  year={2024},
  eprint={2402.13350},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

sdadas/mmlw-roberta-large

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
514 1024 435.0M 1.6 GB 2023-11-17 pol-Latn
Citation
@article{dadas2024pirb,
  title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods}, 
  author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
  year={2024},
  eprint={2402.13350},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

sensenova/piccolo-base-zh

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 not specified not specified 2023-09-04 zho-Hans

sensenova/piccolo-large-zh-v2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 not specified not specified 2024-04-22 zho-Hans
Citation
@misc{2405.06932,
    Author = {Junqin Huang and Zhongjie Hu and Zihao Jing and Mengya Gao and Yichao Wu},
    Title = {Piccolo2: General Text Embedding with Multi-task Hybrid Loss Training},
    Year = {2024},
    Eprint = {arXiv:2405.06932},
}

sentence-transformers/LaBSE

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 471.0M 1.8 GB 2019-11-01 ara-Arab, bul-Cyrl, cat-Latn, ces-Latn, dan-Latn, ... (53)
Citation
@misc{feng2022languageagnosticbertsentenceembedding,
      title={Language-agnostic BERT Sentence Embedding},
      author={Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang},
      year={2022},
      eprint={2007.01852},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2007.01852},
    }

sentence-transformers/all-MiniLM-L12-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
256 384 33.4M 127.0 MB 2021-08-30 eng-Latn
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "http://arxiv.org/abs/1908.10084",
}

sentence-transformers/all-MiniLM-L6-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
256 384 22.7M 87.0 MB 2021-08-30 eng-Latn
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "http://arxiv.org/abs/1908.10084",
}
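
The sentence-transformers checkpoints in this list load directly with the sentence-transformers library. A minimal usage sketch for all-MiniLM-L6-v2 (inputs are illustrative; note that texts longer than the listed 256-token limit are truncated):

# pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

sentences = ["Embedding models map text to vectors.", "This model outputs 384 dimensions."]
embeddings = model.encode(sentences, normalize_embeddings=True)
print(embeddings.shape)  # (2, 384)

# Cosine similarity between the two sentences.
similarity = util.cos_sim(embeddings[0], embeddings[1])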

sentence-transformers/all-mpnet-base-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
384 768 109.0M 418.0 MB 2021-08-30 eng-Latn
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "http://arxiv.org/abs/1908.10084",
}

sentence-transformers/gtr-t5-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 110.0M 209.0 MB 2022-02-09 eng-Latn
Citation
@misc{ni2021largedualencodersgeneralizable,
      title={Large Dual Encoders Are Generalizable Retrievers}, 
      author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
      year={2021},
      eprint={2112.07899},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2112.07899}, 
}

sentence-transformers/gtr-t5-large

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 335.0M 639.0 MB 2022-02-09 eng-Latn
Citation
@misc{ni2021largedualencodersgeneralizable,
      title={Large Dual Encoders Are Generalizable Retrievers}, 
      author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
      year={2021},
      eprint={2112.07899},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2112.07899}, 
}

sentence-transformers/gtr-t5-xl

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 1.2B 2.3 GB 2022-02-09 eng-Latn
Citation
@misc{ni2021largedualencodersgeneralizable,
      title={Large Dual Encoders Are Generalizable Retrievers}, 
      author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
      year={2021},
      eprint={2112.07899},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2112.07899}, 
}

sentence-transformers/gtr-t5-xxl

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 4.9B 9.1 GB 2022-02-09 eng-Latn
Citation
@misc{ni2021largedualencodersgeneralizable,
      title={Large Dual Encoders Are Generalizable Retrievers}, 
      author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
      year={2021},
      eprint={2112.07899},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2112.07899}, 
}

sentence-transformers/multi-qa-MiniLM-L6-cos-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 22.7M 87.0 MB 2021-08-30 eng-Latn
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "http://arxiv.org/abs/1908.10084",
}

sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 118.0M 449.0 MB 2019-11-01 ara-Arab, bul-Cyrl, cat-Latn, ces-Latn, dan-Latn, ... (53)
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "http://arxiv.org/abs/1908.10084",
}

sentence-transformers/paraphrase-multilingual-mpnet-base-v2

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 278.0M 1.0 GB 2019-11-01 ara-Arab, bul-Cyrl, cat-Latn, ces-Latn, dan-Latn, ... (53)
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "http://arxiv.org/abs/1908.10084",
}

sentence-transformers/sentence-t5-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 110.0M 209.0 MB 2022-02-09 eng-Latn
Citation
@misc{ni2021sentencet5scalablesentenceencoders,
      title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models}, 
      author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
      year={2021},
      eprint={2108.08877},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2108.08877}, 
}

sentence-transformers/sentence-t5-large

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 335.0M 639.0 MB 2022-02-09 eng-Latn
Citation
@misc{ni2021sentencet5scalablesentenceencoders,
      title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models}, 
      author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
      year={2021},
      eprint={2108.08877},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2108.08877}, 
}

sentence-transformers/sentence-t5-xl

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 3.0B 2.3 GB 2024-03-27 eng-Latn
Citation
@misc{ni2021sentencet5scalablesentenceencoders,
      title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models}, 
      author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
      year={2021},
      eprint={2108.08877},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2108.08877}, 
}

sentence-transformers/sentence-t5-xxl

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 11.0B 9.1 GB 2024-03-27 eng-Latn
Citation
@misc{ni2021sentencet5scalablesentenceencoders,
      title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models}, 
      author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
      year={2021},
      eprint={2108.08877},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2108.08877}, 
}

sentence-transformers/static-similarity-mrl-multilingual-v1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 1024 108.4M 413.0 MB 2025-01-15 ara-Arab, bul-Cyrl, cat-Latn, ces-Latn, dan-Latn, ... (49)
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
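
The "mrl" in this model's name indicates Matryoshka representation learning, so its 1024-dimensional output can be truncated to smaller sizes with little quality loss. A minimal sketch of dimension truncation via the truncate_dim argument, assuming a sentence-transformers release recent enough to support it:

from sentence_transformers import SentenceTransformer

# Keep only the first 256 Matryoshka dimensions at load time.
model = SentenceTransformer(
    "sentence-transformers/static-similarity-mrl-multilingual-v1",
    truncate_dim=256,
)
embeddings = model.encode(["Ein Beispielsatz.", "Une phrase d'exemple."])
print(embeddings.shape)  # (2, 256)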

sergeyzh/LaBSE-ru-turbo

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 129.0M 490.0 MB 2024-06-27 rus-Cyrl

sergeyzh/rubert-tiny-turbo

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
2.0K 312 29.2M 111.0 MB 2024-06-21 rus-Cyrl

shibing624/text2vec-base-chinese

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 102.0M 390.0 MB 2022-01-23 zho-Hans
Citation
@software{text2vec,
  author = {Xu Ming},
  title = {text2vec: A Tool for Text to Vector},
  year = {2022},
  url = {https://github.com/shibing624/text2vec},
}

shibing624/text2vec-base-chinese-paraphrase

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 118.0M 450.0 MB 2023-06-19 zho-Hans
Citation
@software{text2vec,
  author = {Xu Ming},
  title = {text2vec: A Tool for Text to Vector},
  year = {2022},
  url = {https://github.com/shibing624/text2vec},
}

shibing624/text2vec-base-multilingual

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
256 384 117.7M 449.0 MB 2023-06-22 deu-Latn, eng-Latn, fra-Latn, ita-Latn, nld-Latn, ... (10)
Citation
@software{text2vec,
  author = {Xu Ming},
  title = {text2vec: A Tool for Text to Vector},
  year = {2022},
  url = {https://github.com/shibing624/text2vec},
}

silma-ai/silma-embeddding-matryoshka-v0.1

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 135.2M 516.0 MB 2024-10-12 ara-Arab, eng-Latn
Citation
@misc{silma2024embedding,
  author = {Abu Bakr Soliman, Karim Ouda, SILMA AI},
  title = {SILMA Embedding Matryoshka 0.1},
  year = {2024},
  publisher = {Hugging Face},
  howpublished = {https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1},
}

spartan8806/atles-champion-embedding

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 110.0M 420.0 MB 2025-11-15 eng-Latn
Citation
@article{conner2025epistemic,
  title={The Epistemic Barrier: How RLHF Makes AI Consciousness Empirically Undecidable},
  author={Conner (spartan8806)},
  journal={ATLES Research Papers},
  year={2025},
  note={Cross-model validation study (Phoenix, Grok, Gemini, Claude)}
}

thenlper/gte-base

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 768 109.5M 209.0 MB 2023-07-27 eng-Latn
Citation
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}

thenlper/gte-base-zh

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 102.0M 195.0 MB 2023-11-08 zho-Hans
Citation
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}

thenlper/gte-large

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 335.1M 639.0 MB 2023-07-27 eng-Latn
Citation
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}

thenlper/gte-large-zh

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 326.0M 621.0 MB 2023-11-08 zho-Hans
Citation
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}

thenlper/gte-small

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 384 33.4M 64.0 MB 2023-07-27 eng-Latn
Citation
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}

thenlper/gte-small-zh

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
512 1024 30.3M 58.0 MB 2023-11-08 zho-Hans
Citation
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}

unicamp-dl/mt5-13b-mmarco-100k

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2022-11-04 afr-Latn, amh-Ethi, ara-Arab, aze-Latn, bel-Cyrl, ... (103)

unicamp-dl/mt5-base-mmarco-v2

License: not specified

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified not specified not specified not specified 2022-01-05 afr-Latn, amh-Ethi, ara-Arab, aze-Latn, bel-Cyrl, ... (103)
Citation
@misc{bonifacio2021mmarco,
      title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset},
      author={Luiz Henrique Bonifacio and Vitor Jeronymo and Hugo Queiroz Abonizio and Israel Campiotti and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
      year={2021},
      eprint={2108.13897},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
    }

w601sxs/b1ade-embed

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
4.1K 1024 335.0M 1.2 GB 2025-03-10 eng-Latn
Citation
@misc{bigscience_workshop_2022,
    author       = { {Shreyas Subramanian} },
    title        = { {b1ade series of models} },
    year         = 2024,
    url          = { https://huggingface.co/w601sxs/b1ade-embed },
    publisher    = { Hugging Face }
}