# File size: 7,959 Bytes
# 17e77ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import math
import streamlit as st
from utils import geoutil
import pickle


def update_entities(doc, entity_texts, replace=True):
    """
    Label entities in ``doc`` from a mapping of entity text to entity label.

    For every key of ``entity_texts`` the first occurrence in ``doc.text`` is
    located by character offset and converted to a span with
    ``doc.char_span``. The collected spans are then written to the document
    with ``doc.set_ents``. Working on character offsets keeps the span
    aligned with the document's own tokenization instead of assuming that
    tokens are separated by whitespace.

    :param doc: parsed spaCy ``Doc``; it is modified in place
    :param entity_texts: dict whose keys are the entity texts to label and
        whose values are the corresponding entity labels
    :param replace: if True the existing entities are discarded, if False
        they are kept and the new ones are appended
    :return: None
    """
    new_ents = [] if replace else list(doc.ents)

    for ent_text, ent_label in entity_texts.items():
        if not ent_text:
            # str.find("") returns 0 and would produce a zero-length span.
            continue

        start_char = doc.text.find(ent_text)
        if start_char == -1:
            continue

        # "expand" snaps the character range outwards to token boundaries so
        # a match that starts or ends inside a token still yields a span.
        span = doc.char_span(
            start_char,
            start_char + len(ent_text),
            label=ent_label,
            alignment_mode="expand",
        )
        if span is not None:
            new_ents.append(span)

    doc.set_ents(new_ents)


# def midpoint(x1, y1, x2, y2, angle):
def midpoint(y1, x1, y2, x2, angle):
    """
    Compute the midpoint of two points on a sphere.

    ``y1``/``y2`` are used as longitudes and ``x1``/``x2`` as latitudes, all
    in degrees. ``angle`` is not used in the computation; it is passed
    through unchanged as the third element of the result.

    :return: ``[longitude, latitude, angle]`` with longitude and latitude in
        degrees, rounded to 8 decimal places
    """
    lon_a, lon_b, lat_a, lat_b = map(math.radians, (y1, y2, x1, x2))
    delta_lon = lon_b - lon_a

    bx = math.cos(lat_b) * math.cos(delta_lon)
    by = math.cos(lat_b) * math.sin(delta_lon)
    ax_plus_bx = math.cos(lat_a) + bx

    lat_mid = math.atan2(
        math.sin(lat_a) + math.sin(lat_b),
        math.sqrt(ax_plus_bx * ax_plus_bx + by * by),
    )
    lon_mid = lon_a + math.atan2(by, ax_plus_bx)
    # Wrap the longitude back into the [-pi, pi) range.
    lon_mid = (lon_mid + 3 * math.pi) % (2 * math.pi) - math.pi

    return [
        round(math.degrees(lon_mid), 8),
        round(math.degrees(lat_mid), 8),
        angle,
    ]


def get_midmid_point(centroid, point1, point2, is_midmid):
    """
    Return points lying between ``centroid`` and each of two other points.

    For each of ``point1`` and ``point2`` the midpoint with ``centroid`` is
    computed via ``midpoint``; a second pass then takes the midpoint between
    ``centroid`` and that first midpoint. The third element of each input
    point is carried through by ``midpoint``.

    :param centroid: sequence whose first two elements are passed to
        ``midpoint`` as its first point
    :param point1: sequence of at least three elements
    :param point2: sequence of at least three elements
    :param is_midmid: if truthy return the second-pass points, otherwise the
        first-pass midpoints
    :return: a 2-tuple of ``midpoint`` results
    """

    def halfway_to(target):
        # Midpoint between the centroid and ``target``.
        return midpoint(centroid[0], centroid[1],
                        target[0], target[1], target[2])

    mid1 = halfway_to(point1)
    mid2 = halfway_to(point2)
    midmid1 = halfway_to(mid1)
    midmid2 = halfway_to(mid2)

    return (midmid1, midmid2) if is_midmid else (mid1, mid2)




import spacy
from spacy.language import Language
import regex_spatial
from spacy.tokens import Span, Doc, Token
import re
import llm_ent_extract


# Name of the custom spaCy extension attribute registered by set_extension()
# and read/written elsewhere in this file as ``._.rse_id``.
rse_id = "rse_id"
def set_extension():
    """
    Register the ``rse_id`` custom attribute on ``Span``, ``Doc`` and ``Token``.

    The attribute defaults to an empty string. ``force=True`` is passed so
    that registering again overwrites an existing registration.
    """
    for spacy_type in (Span, Doc, Token):
        spacy_type.set_extension(rse_id, default="", force=True)
def find_ent_by_regex(doc, sentence, ent, regex):
  """
  Widen ``ent`` with the first match of ``regex`` that starts inside ``sentence``.

  The module-level global ``id`` (which shadows the builtin of the same name)
  is used as an accumulator: it must already be a string when this function
  is called, and the text of a matched span is prepended to it.

  :param doc: document whose ``text`` is searched and whose ``char_span`` is used
  :param sentence: object providing ``start_char`` / ``end_char`` bounds
  :param ent: span whose ``start_char`` / ``end_char`` are modified in place
  :param regex: pattern passed to ``re.finditer``
  :return: the same ``ent`` object that was passed in
  """
  global id

  # Seed the accumulator with the entity text on the first call of a chain.
  if id == "":
      id = ent.text
  # The pattern is matched against the whole document text; matches starting
  # outside the sentence bounds are skipped.
  for match in re.finditer(regex, doc.text):
        start, end = match.span()
        if(start>= sentence.start_char and start<= sentence.end_char):
          span = doc.char_span(start, end)
          # Nothing is updated when char_span returns None for this range.
          if span is not None:
            id = span.text +"_"+ id
            # A match that starts after the entity moves the entity's end;
            # any other match moves the entity's start.
            if(start > ent.end_char):
              ent.end_char = end
            else:
              ent.start_char = start

          # Only the first match that starts inside the sentence is
          # considered, whether or not it produced a span.
          return ent

  return ent
def get_level1(doc, sentence, ent):
    """Run ``find_ent_by_regex`` on ``ent`` with the level-1 pattern from ``regex_spatial``."""
    level1_pattern = regex_spatial.get_level1_regex()
    return find_ent_by_regex(doc, sentence, ent, level1_pattern)

def get_level2(doc, sentence, ent):
    """Run ``find_ent_by_regex`` on ``ent`` with the level-2 pattern from ``regex_spatial``."""
    level2_pattern = regex_spatial.get_level2_regex()
    return find_ent_by_regex(doc, sentence, ent, level2_pattern)

def get_level3(doc, sentence, ent):
    """Run ``find_ent_by_regex`` on ``ent`` with the level-3 pattern from ``regex_spatial``."""
    level3_pattern = regex_spatial.get_level3_regex()
    return find_ent_by_regex(doc, sentence, ent, level3_pattern)

def get_relative_entity(doc, sentence, ent):
    """
    Build the entity span for ``ent`` after applying the three regex levels.

    The module-level global ``id`` is reset to an empty string and then
    filled in by ``get_level1``, ``get_level2`` and ``get_level3``, which are
    applied in that order. If the resulting ``id`` contains an underscore the
    returned span is labelled ``"RSE"``; otherwise it keeps the label of
    ``ent``. In both cases the span is created with ``doc.char_span`` and its
    ``rse_id`` extension attribute is set to ``id``.

    :param doc: document the entity belongs to
    :param sentence: span bounding where regex matches may start
    :param ent: entity to start from
    :return: the span returned by ``doc.char_span``
    """
    global id
    id = ""

    candidate = ent
    for apply_level in (get_level1, get_level2, get_level3):
        candidate = apply_level(doc, sentence, candidate)

    if "_" in id:
        source, label = candidate, "RSE"
    else:
        source, label = ent, ent.label_

    result = doc.char_span(source.start_char, source.end_char, label)
    result._.rse_id = id
    return result


@Language.component("spatial_pipeline")
def get_spatial_ent(doc):
    """
    spaCy pipeline component that replaces ``doc.ents`` with spatial entities.

    Only entities labelled ``GPE`` or ``LOC`` are considered. Each one is
    passed to ``get_relative_entity`` together with a token window: the
    window of the first entity starts at the start of its sentence, every
    later window starts where the previous one ended, and each window ends
    at the end of its entity. The spans returned by ``get_relative_entity``
    become the new ``doc.ents``.

    :param doc: spaCy ``Doc`` with entities already set
    :return: the same ``doc`` with ``doc.ents`` replaced
    """
    set_extension()

    # NOTE: an experiment that replaced the entities with LLM output
    # (llm_ent_extract + update_entities) used to sit here, disabled.
    place_ents = [ent for ent in doc.ents if ent.label_ in ("GPE", "LOC")]

    new_ents = []
    window_start = None
    for ent in place_ents:
        if window_start is None:
            window_start = ent.sent.start

        # The window is now computed for every entity. Earlier it was skipped
        # for an entity ending on the last token of the doc, which left the
        # bounds unset or stale.
        #
        # NOTE(review): the former lookup of the following token in
        # regex_spatial.get_keywords() chose between doc[ent.end].i and
        # ent.end, which are the same index, so the window always ended at
        # ent.end. TODO confirm whether the keyword token was meant to be
        # included (ent.end + 1).
        window_end = ent.end

        window = Span(doc, window_start, window_end)
        new_ents.append(get_relative_entity(doc, window, ent))
        window_start = window_end

    doc.ents = new_ents
    return doc
# Entity labels that set_selected_entities() keeps.
gpe_selected = "GPE"
loc_selected = "LOC"
rse_selected = "RSE"


def set_selected_entities(doc):
    """
    Restrict ``doc.ents`` to the selected entity labels.

    An entity is kept when its ``label_`` equals one of the module-level
    ``gpe_selected``, ``loc_selected`` or ``rse_selected`` values.

    :param doc: object exposing an ``ents`` attribute; modified in place
    :return: the same ``doc``
    """
    wanted_labels = (gpe_selected, loc_selected, rse_selected)
    doc.ents = [entity for entity in doc.ents if entity.label_ in wanted_labels]
    return doc

# text = 'Sydney is 6 kilometres to the east.'
def extract_spatial_entities(text):
    """
    Extract spatial entities from ``text`` sentence by sentence and save them.

    The text is parsed once to obtain tokens and sentence boundaries. The
    ``spatial_pipeline`` component is then added and every sentence is
    processed on its own, filtered with ``set_selected_entities``, and its
    entities are mapped back onto a fresh ``Doc`` covering the whole text.
    The ``rse_id`` extension value of each entity is copied along. The
    resulting document is written to ``saved_doc.spacy`` in the current
    working directory.

    NOTE(review): mapping back assumes that re-tokenizing ``sent.text`` gives
    the same tokens as the sentence had in the full document — TODO confirm.

    :param text: raw input text
    :return: None
    """
    nlp = spacy.load("en_core_web_md")
    # The first pass runs before "spatial_pipeline" is added; it is only used
    # for tokens and sentence boundaries.
    doc = nlp(text)

    nlp.add_pipe("spatial_pipeline", after="ner")

    # Rebuild the document up front so spans can be created directly on it.
    # Sentence starts are carried over so that ``Span.sent`` is available.
    final_doc = Doc(
        nlp.vocab,
        words=[token.text for token in doc],
        spaces=[bool(token.whitespace_) for token in doc],
        sent_starts=[token.is_sent_start for token in doc],
    )

    sent_ents = []
    for sent in doc.sents:
        sent_doc = set_selected_entities(nlp(sent.text))

        # Token indices in sent_doc are relative to the sentence; shifting by
        # the sentence start gives indices in the full document.
        for ent in sent_doc.ents:
            new_ent = Span(
                final_doc,
                ent.start + sent.start,
                ent.end + sent.start,
                label=ent.label_,
            )
            # Keep the identifier computed by the pipeline instead of
            # dropping it when the span is re-created.
            new_ent._.rse_id = ent._.rse_id
            sent_ents.append(new_ent)

    final_doc.set_ents(sent_ents)

    print('-' * 50)
    print("修改后实体:", [(ent.text, ent.label_) for ent in final_doc.ents])

    # Debug output for the first entity; skipped when nothing was found.
    if final_doc.ents:
        first_ent = final_doc.ents[0]
        print(first_ent._.rse_id, 'final_entO')
        print(first_ent.sent, 'final_entO')

    final_doc.to_disk("saved_doc.spacy")
    print("Doc saved successfully!")


# Sample inputs. Each assignment overwrites the previous one, so only the
# last text is actually processed.
text = 'Between Burwood and Pyrmont. Between Burwood and Pyrmont city.'
text = 'Between Burwood and Pyrmont.'
text = "New York is north of Washington. Between Burwood and Pyrmont city."
text = "5 km east of Burwood."

# Runs whenever this module is executed or imported.
extract_spatial_entities(text)

# Reload the document from the file written by extract_spatial_entities.
nlp = spacy.load("en_core_web_md")
doc = Doc(nlp.vocab).from_disk("saved_doc.spacy")

print("修改后实体:", [(ent.text, ent.label_) for ent in doc.ents])
# NOTE(review): indexes the first entity unconditionally; this raises
# IndexError when the reloaded document has no entities.
print(doc.ents[0]._.rse_id, 'final_entO')
# print(doc.ents[0].sent, 'final_entO')