kpfadnis's picture
chore: Initial commit.
599f646
raw
history blame
6 kB
/**
*
* Copyright 2023-2024 InspectorRAGet Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/
import { StringMatchObject } from '@/src/types';
const crypto = require('crypto');
export function truncate(text: string, length: number): string {
if (text.length > length) {
return text.slice(0, length) + ' ...';
}
return text;
}
export function hash(text: string): string {
return crypto.createHash('md5').update(text).digest('hex');
}
/**
* Normalize text (e.g., characters used for quotes). Used to improve matching.
* @param text
* @returns
*/
function normalize(text) {
var normalizedText = text;
// normalize double and single quotes
normalizedText = text.replace(/[β€œβ€]/g, '"');
normalizedText = normalizedText.replace(/[β€˜β€™]/g, "'");
return normalizedText;
}
/**
* Helper functions to identify token boundaries
* getNextTokenStart: Identifies start of next token
* getNextTokenEnd: Identified end of next token
*/
/**
* Identify start of next token
* @param text
* @param offset starting offset in the text
* @returns starting position index of next token
*/
function getNextTokenStart(text: string, offset: number = 0): number {
// Step 1: Set starting index to provided offset
var startIndex = offset;
// Step 2: Skip over non-alphanumeric characters at the start
while (startIndex < text.length && /\W/.test(text.charAt(startIndex))) {
startIndex++;
}
// Step 3: Return
return startIndex;
}
/**
* Identify end of next token
* @param text
* @param offset starting offset in the text
* @returns ending position index of next token
*/
function getNextTokenEnd(text: string, offset: number = 0): number {
// Step 1: Set end index to be starting index of next token
var endIndex = getNextTokenStart(text, offset);
// Step 2: Include alphanumeric characters until the first non-alphanumeric character is found
while (endIndex < text.length && !/\W/.test(text.charAt(endIndex))) {
endIndex++;
}
return endIndex;
}
/**
* Create regular expression based on string
* @param text regular expression string
* @returns
*/
function createRegex(text: string): RegExp {
// Escape regular expression characters
const escapedText = text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
return new RegExp(escapedText, 'g');
}
/**
* Find matches in the text based on query using regular expression
* @param query string to find
* @param text
* @returns
*/
function match(query: string, text: string) {
// Step 1: Create regular expression
const regex = createRegex(query);
// Step 2: Find matches
const matches: { readonly start: number; readonly end: number }[] = [];
let match;
while ((match = regex.exec(text)) !== null) {
matches.push({
start: match.index,
end: regex.lastIndex,
});
}
// Step 3: Return
return matches;
}
/**
* Find overlap based on matching tokens between source and target
* @param source string from which tokens are used to find overlap
* @param target string in which overlaps are found
* @param min_match_tokens
* @returns
*/
export function overlaps(
source: string,
target: string,
min_match_tokens: number = 3,
) {
// Step 1: Normalize source and target text
const normalizedSource = normalize(source).toLowerCase();
const normalizedTarget = normalize(target).toLowerCase();
// Step 2: Define necessary variables
const matches: StringMatchObject[] = [];
// Step 3: Find matches
// Step 3.a: Identify starting position for next token in the source and set current end position to same starting position
let curStartPos = getNextTokenStart(normalizedSource, 0);
// Step 3.b: Keep finding next starting position till all tokens in the source are seen
while (curStartPos < normalizedSource.length) {
let curEndPos = curStartPos;
let matchTokenLength = 0;
let substringEndPos = curStartPos;
// Step 3.b.i: Identify next minimum match tokens
while (matchTokenLength < min_match_tokens - 1) {
substringEndPos = getNextTokenEnd(normalizedSource, substringEndPos);
matchTokenLength++;
}
// Step 3.b.ii:
do {
// Update temporary end position
substringEndPos = getNextTokenEnd(normalizedSource, substringEndPos);
// Find matches for the source substring in the target
var matchesInTarget = normalizedTarget.match(
createRegex(normalizedSource.substring(curStartPos, substringEndPos)),
);
if (matchesInTarget != null) {
curEndPos = substringEndPos;
}
} while (matchesInTarget != null && curEndPos < normalizedSource.length);
if (curEndPos !== curStartPos) {
const localMatches: { start: number; end: number }[] = match(
normalizedSource.substring(curStartPos, curEndPos),
normalizedTarget,
);
matches.push({
start: curStartPos,
end: curEndPos,
text: normalizedSource.substring(curStartPos, curEndPos),
matchesInTarget: localMatches,
count: localMatches.length,
});
// Set current starting position to next token in the source past current ending position
curStartPos = getNextTokenStart(normalizedSource, curEndPos);
} else {
// Set current starting position to next token in the source past current ending position
curStartPos = getNextTokenStart(
normalizedSource,
getNextTokenEnd(normalizedSource, curEndPos),
);
}
}
return matches;
}