Spaces:
Running
Running
/** | |
* | |
* Copyright 2023-2024 InspectorRAGet Team | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
* | |
**/ | |
import { StringMatchObject } from '@/src/types'; | |
const crypto = require('crypto'); | |
export function truncate(text: string, length: number): string { | |
if (text.length > length) { | |
return text.slice(0, length) + ' ...'; | |
} | |
return text; | |
} | |
export function hash(text: string): string { | |
return crypto.createHash('md5').update(text).digest('hex'); | |
} | |
/** | |
* Normalize text (e.g., characters used for quotes). Used to improve matching. | |
* @param text | |
* @returns | |
*/ | |
function normalize(text) { | |
var normalizedText = text; | |
// normalize double and single quotes | |
normalizedText = text.replace(/[ββ]/g, '"'); | |
normalizedText = normalizedText.replace(/[ββ]/g, "'"); | |
return normalizedText; | |
} | |
/** | |
* Helper functions to identify token boundaries | |
* getNextTokenStart: Identifies start of next token | |
* getNextTokenEnd: Identified end of next token | |
*/ | |
/** | |
* Identify start of next token | |
* @param text | |
* @param offset starting offset in the text | |
* @returns starting position index of next token | |
*/ | |
function getNextTokenStart(text: string, offset: number = 0): number { | |
// Step 1: Set starting index to provided offset | |
var startIndex = offset; | |
// Step 2: Skip over non-alphanumeric characters at the start | |
while (startIndex < text.length && /\W/.test(text.charAt(startIndex))) { | |
startIndex++; | |
} | |
// Step 3: Return | |
return startIndex; | |
} | |
/** | |
* Identify end of next token | |
* @param text | |
* @param offset starting offset in the text | |
* @returns ending position index of next token | |
*/ | |
function getNextTokenEnd(text: string, offset: number = 0): number { | |
// Step 1: Set end index to be starting index of next token | |
var endIndex = getNextTokenStart(text, offset); | |
// Step 2: Include alphanumeric characters until the first non-alphanumeric character is found | |
while (endIndex < text.length && !/\W/.test(text.charAt(endIndex))) { | |
endIndex++; | |
} | |
return endIndex; | |
} | |
/** | |
* Create regular expression based on string | |
* @param text regular expression string | |
* @returns | |
*/ | |
function createRegex(text: string): RegExp { | |
// Escape regular expression characters | |
const escapedText = text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); | |
return new RegExp(escapedText, 'g'); | |
} | |
/** | |
* Find matches in the text based on query using regular expression | |
* @param query string to find | |
* @param text | |
* @returns | |
*/ | |
function match(query: string, text: string) { | |
// Step 1: Create regular expression | |
const regex = createRegex(query); | |
// Step 2: Find matches | |
const matches: { readonly start: number; readonly end: number }[] = []; | |
let match; | |
while ((match = regex.exec(text)) !== null) { | |
matches.push({ | |
start: match.index, | |
end: regex.lastIndex, | |
}); | |
} | |
// Step 3: Return | |
return matches; | |
} | |
/** | |
* Find overlap based on matching tokens between source and target | |
* @param source string from which tokens are used to find overlap | |
* @param target string in which overlaps are found | |
* @param min_match_tokens | |
* @returns | |
*/ | |
export function overlaps( | |
source: string, | |
target: string, | |
min_match_tokens: number = 3, | |
) { | |
// Step 1: Normalize source and target text | |
const normalizedSource = normalize(source).toLowerCase(); | |
const normalizedTarget = normalize(target).toLowerCase(); | |
// Step 2: Define necessary variables | |
const matches: StringMatchObject[] = []; | |
// Step 3: Find matches | |
// Step 3.a: Identify starting position for next token in the source and set current end position to same starting position | |
let curStartPos = getNextTokenStart(normalizedSource, 0); | |
// Step 3.b: Keep finding next starting position till all tokens in the source are seen | |
while (curStartPos < normalizedSource.length) { | |
let curEndPos = curStartPos; | |
let matchTokenLength = 0; | |
let substringEndPos = curStartPos; | |
// Step 3.b.i: Identify next minimum match tokens | |
while (matchTokenLength < min_match_tokens - 1) { | |
substringEndPos = getNextTokenEnd(normalizedSource, substringEndPos); | |
matchTokenLength++; | |
} | |
// Step 3.b.ii: | |
do { | |
// Update temporary end position | |
substringEndPos = getNextTokenEnd(normalizedSource, substringEndPos); | |
// Find matches for the source substring in the target | |
var matchesInTarget = normalizedTarget.match( | |
createRegex(normalizedSource.substring(curStartPos, substringEndPos)), | |
); | |
if (matchesInTarget != null) { | |
curEndPos = substringEndPos; | |
} | |
} while (matchesInTarget != null && curEndPos < normalizedSource.length); | |
if (curEndPos !== curStartPos) { | |
const localMatches: { start: number; end: number }[] = match( | |
normalizedSource.substring(curStartPos, curEndPos), | |
normalizedTarget, | |
); | |
matches.push({ | |
start: curStartPos, | |
end: curEndPos, | |
text: normalizedSource.substring(curStartPos, curEndPos), | |
matchesInTarget: localMatches, | |
count: localMatches.length, | |
}); | |
// Set current starting position to next token in the source past current ending position | |
curStartPos = getNextTokenStart(normalizedSource, curEndPos); | |
} else { | |
// Set current starting position to next token in the source past current ending position | |
curStartPos = getNextTokenStart( | |
normalizedSource, | |
getNextTokenEnd(normalizedSource, curEndPos), | |
); | |
} | |
} | |
return matches; | |
} | |