File size: 1,761 Bytes
5a60200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import spacy
import pytextrank
from math import sqrt
from operator import itemgetter

# Shared spaCy pipeline, built once when this module is imported.
# The 'textrank' component is added so that processed documents expose
# `doc._.phrases`, which `_phrase_vector` reads below.
# NOTE(review): the otherwise-unused `import pytextrank` above is presumably
# what registers the 'textrank' factory with spaCy -- do not remove it.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('textrank')


def _phrase_vector(doc):
    """Build normalized phrase weights and the phrase set of every sentence.

    Args:
        doc: Processed document exposing ``doc.sents`` (spans with ``start``
            and ``end``) and ``doc._.phrases`` (items with ``rank`` and
            ``chunks``, each chunk having ``start`` and ``end``).

    Returns:
        A pair ``(unit_vector, sent_bounds)``:

        * ``unit_vector`` -- one weight per phrase, in ``doc._.phrases``
          order: the phrase rank divided by the sum of all ranks. When the
          ranks sum to zero every weight is ``0.0``.
        * ``sent_bounds`` -- one ``[start, end, phrase_ids]`` list per
          sentence, where ``phrase_ids`` is the set of phrase indices having
          at least one chunk that lies entirely inside the sentence.
    """
    sent_bounds = [[sent.start, sent.end, set()] for sent in doc.sents]
    ranks = []

    for phrase_id, phrase in enumerate(doc._.phrases):
        ranks.append(phrase.rank)
        for chunk in phrase.chunks:
            for sent_start, sent_end, sent_vector in sent_bounds:
                if sent_start <= chunk.start and chunk.end <= sent_end:
                    sent_vector.add(phrase_id)
                    # A chunk is attributed to the first sentence containing it.
                    break

    sum_ranks = sum(ranks)
    if not sum_ranks:
        # No phrases, or every phrase has rank 0: normalizing would divide by
        # zero, so report zero weight for each phrase instead of raising.
        return [0.0] * len(ranks), sent_bounds

    return [rank / sum_ranks for rank in ranks], sent_bounds


def _sent_rank(unit_vector, sent_bounds):
    """Score each sentence by the phrase weight it fails to cover.

    Args:
        unit_vector: Sequence of per-phrase weights, indexed by phrase id.
        sent_bounds: Iterable of ``(start, end, phrase_ids)`` triples, one per
            sentence; only ``phrase_ids`` (a container of phrase ids present
            in the sentence) is used here.

    Returns:
        Dict mapping the sentence's position in ``sent_bounds`` to the square
        root of the summed squares of the weights of all phrases that are
        *not* in that sentence. A smaller value means more weight is covered.
    """
    distances = {}

    for index, (_start, _end, covered) in enumerate(sent_bounds):
        missing_sq = 0.0
        for phrase_id, weight in enumerate(unit_vector):
            if phrase_id in covered:
                continue
            missing_sq += weight ** 2.0
        distances[index] = sqrt(missing_sq)

    return distances


def _rank_to_summary(sent_rank, doc, summary_lines):
    """Join the text of the first ``summary_lines`` ranked sentences.

    Args:
        sent_rank: Iterable of ``(sent_id, score)`` pairs, already ordered
            best-first; only ``sent_id`` is used.
        doc: Document whose ``doc.sents`` yields sentences with a ``text``
            attribute; ``sent_id`` is the sentence's position in that
            sequence.
        summary_lines: Maximum number of sentences to include. Zero or a
            negative number yields an empty summary; ``None`` means no limit.

    Returns:
        The selected sentence texts, in ``sent_rank`` order, joined by a
        single space.

    Raises:
        KeyError: If a ``sent_id`` does not correspond to a sentence in
            ``doc``.
    """
    if summary_lines is not None and summary_lines <= 0:
        # Previously the limit was tested with `==` only after appending, so
        # a non-positive limit never matched and the whole text was returned.
        return ''

    sent_text = dict(enumerate(sent.text for sent in doc.sents))

    summary = []
    for sent_id, _ in sent_rank:
        summary.append(sent_text[sent_id])
        if summary_lines is not None and len(summary) >= summary_lines:
            break

    return ' '.join(summary)


def summarize(text, summary_lines):
    """Return an extractive summary of ``text``.

    The text is run through the module-level ``nlp`` pipeline, each sentence
    is scored with ``_sent_rank``, and the sentences are ordered by ascending
    score before the leading ones are joined into the summary.

    Args:
        text: Raw text to summarize.
        summary_lines: Number of sentences requested, passed through to
            ``_rank_to_summary``.

    Returns:
        The chosen sentences joined by single spaces, lowest score first.
    """
    doc = nlp(text)
    weights, bounds = _phrase_vector(doc)
    scores = _sent_rank(weights, bounds)
    # Lowest score first; `sorted` is stable, so ties keep document order.
    ordered = sorted(scores.items(), key=itemgetter(1))
    return _rank_to_summary(ordered, doc, summary_lines)