import spacy
import numpy as np
import os
from zhconv import convert
import re
import random

# Russian-specific spaCy model
import ru_core_news_sm

def detect_lang(text):
    # Per-language character counters
    lang_dict = {'zh-cn': 0, 'zh-tw': 0, 'en': 0, 'ru': 0, 'other': 0}
    # Randomly sample at most ten characters from the text
    sample = random.sample(text, min(10, len(text)))
    # Count how many sampled characters fall into each language's range
    for char in sample:
        if re.search(r'[\u4e00-\u9fa5]', char):      # common CJK ideographs
            lang_dict['zh-cn'] += 1
        elif re.search(r'[\u4e00-\u9fff]', char):    # remaining CJK code points
            lang_dict['zh-tw'] += 1
        elif re.search(r'[a-zA-Z]', char):
            lang_dict['en'] += 1
        elif re.search(r'[а-яА-ЯёЁ]', char):         # Cyrillic range for Russian letters
            lang_dict['ru'] += 1
        else:
            lang_dict['other'] += 1
    # Return the language with the highest count
    return max(lang_dict, key=lang_dict.get)
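
# Illustrative expectations (hypothetical inputs; sampling is random, but for these
# single-language strings the result is deterministic):
#   detect_lang("hello world")   -> 'en'
#   detect_lang("Привет, мир")   -> 'ru'
#   detect_lang("你好，世界")     -> 'zh-cn'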

class embedding_processing:

    def __init__(self, model_path='./model'):
        self.en_model = spacy.load('en_core_web_sm')
        self.zh_model = spacy.load('zh_core_web_sm')
        self.ru_model = ru_core_news_sm.load()  # Russian-language model

    def model(self, text):
        lang = detect_lang(text)
        # Convert traditional Chinese to simplified before running the Chinese model
        if lang == "zh-tw":
            ans_cn = self.zh_model(convert(text, 'zh-cn')).vector.tolist()
        else:
            ans_cn = self.zh_model(text).vector.tolist()
        ans = self.en_model(text).vector.tolist()
        # Concatenate the Chinese and English document vectors
        return ans_cn + ans

    def embedding(self, text_list):
        # Embed each text and wrap the results in an OpenAI-style response dict
        embeddings_list = [self.model(text) for text in text_list]
        response_embedding = self.transform_embedding_to_dict(embeddings_list, text_list)
        return response_embedding

    def transform_embedding_to_dict(self, embedding_list, text_list, model_name="text-embedding-elmo-002"):
        # Token counts are approximations: characters for the prompt, vector dimensions for the total
        prompt_tokens = sum(len(text) for text in text_list)
        total_tokens = sum(len(embedding) for embedding in embedding_list)

        transformed_data = {
            "data": [
                {
                    "embedding": embedding,
                    "index": index,
                    "object": "embedding"
                }
                for index, embedding in enumerate(embedding_list)
            ],
            "model": model_name,
            "object": "list",
            "usage": {
                "prompt_tokens": prompt_tokens,
                "total_tokens": total_tokens
            }
        }
        return transformed_data
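
# Usage sketch (assumption: en_core_web_sm, zh_core_web_sm and ru_core_news_sm are
# installed; the sample texts below are illustrative only).
if __name__ == "__main__":
    processor = embedding_processing()
    response = processor.embedding(["Hello world", "你好，世界"])
    # Each "data" entry holds the concatenated zh + en vector for one input text.
    print(response["model"], len(response["data"]))
    print(len(response["data"][0]["embedding"]))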