Make regex patterns constants
Browse files- logmetric.py +12 -15
logmetric.py
CHANGED
|
@@ -20,7 +20,13 @@ import dateutil.parser
|
|
| 20 |
import numpy as np
|
| 21 |
from typing import List, Dict, Any
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# TODO: Add BibTeX citation
|
| 26 |
_CITATION = """\
|
|
@@ -112,17 +118,8 @@ class LogMetric(evaluate.Metric):
|
|
| 112 |
|
| 113 |
|
| 114 |
class PredRefScore:
|
| 115 |
-
# Constant regex to get timestrings
|
| 116 |
-
timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
|
| 117 |
-
timestamp_pattern = re.compile(timestamp_regex, re.MULTILINE)
|
| 118 |
-
|
| 119 |
-
int_pattern = re.compile(r'(-?\d+)')
|
| 120 |
-
float_pattern = re.compile(r'(-?\d+\.\d+)')
|
| 121 |
-
|
| 122 |
scores : Dict[str, float]= {}
|
| 123 |
|
| 124 |
-
sacrebleu_metric = evaluate.load("evaluate-metric/sacrebleu")
|
| 125 |
-
|
| 126 |
def __init__(self, prediction : str, reference: str) -> Dict[str, float]:
|
| 127 |
self.reference = reference.strip(' \t\n\r')
|
| 128 |
self.prediction = prediction.strip(' \t\n\r')
|
|
@@ -173,14 +170,14 @@ class PredRefScore:
|
|
| 173 |
|
| 174 |
# Replaces numbers in a string with a placeholder
|
| 175 |
def replaceNumbers(self, text : str) -> str:
|
| 176 |
-
text =
|
| 177 |
-
text =
|
| 178 |
return text
|
| 179 |
|
| 180 |
# Split all log-entries in timestamps and log-messages
|
| 181 |
def split_log_entry(self, pred : str, ref: str):
|
| 182 |
-
pred_split_log =
|
| 183 |
-
ref_split_log =
|
| 184 |
|
| 185 |
# One logentry always consists of timestamp + log-message
|
| 186 |
pred_timestamps, pred_logMessages = [], []
|
|
@@ -212,7 +209,7 @@ class PredRefScore:
|
|
| 212 |
self.scores["linecount_difference_SMAPE_score"] = self.smapeScore(pred_lines_amt, ref_lines_amt)
|
| 213 |
|
| 214 |
def set_sacrebleu_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
|
| 215 |
-
sacrebleu_score =
|
| 216 |
self.scores["linecontent_sacrebleu_score"] = sacrebleu_score
|
| 217 |
|
| 218 |
def set_smape_length_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
|
|
@@ -223,7 +220,7 @@ class PredRefScore:
|
|
| 223 |
vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
|
| 224 |
cleaned_pred_logMessages = vectorized_replaceNumbers(pred_log_messages)
|
| 225 |
cleaned_ref_logMessages = vectorized_replaceNumbers(ref_log_messages)
|
| 226 |
-
sacrebleu_withoutExplicitNumbers_score =
|
| 227 |
self.scores["linecontent_sacrebleu_withoutExplicitNumbers_score"] = sacrebleu_withoutExplicitNumbers_score
|
| 228 |
|
| 229 |
# Get different scores regarding the content of a log-message
|
|
|
|
| 20 |
import numpy as np
|
| 21 |
from typing import List, Dict, Any
|
| 22 |
|
| 23 |
+
# Constant regex to get timestrings
|
| 24 |
+
timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
|
| 25 |
+
TIMESTAMP_PATTERN = re.compile(timestamp_regex, re.MULTILINE)
|
| 26 |
|
| 27 |
+
INT_PATTERN = re.compile(r'(-?\d+)')
|
| 28 |
+
FLOAT_PATTERN = re.compile(r'(-?\d+\.\d+)')
|
| 29 |
+
SACREBLEU_METRIC = evaluate.load("evaluate-metric/sacrebleu")
|
| 30 |
|
| 31 |
# TODO: Add BibTeX citation
|
| 32 |
_CITATION = """\
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
class PredRefScore:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
scores : Dict[str, float]= {}
|
| 122 |
|
|
|
|
|
|
|
| 123 |
def __init__(self, prediction : str, reference: str) -> Dict[str, float]:
|
| 124 |
self.reference = reference.strip(' \t\n\r')
|
| 125 |
self.prediction = prediction.strip(' \t\n\r')
|
|
|
|
| 170 |
|
| 171 |
# Replaces numbers in a string with a placeholder
|
| 172 |
def replaceNumbers(self, text : str) -> str:
    """Replace numeric literals in ``text`` with placeholder tokens.

    Floats are substituted before integers. Doing it the other way round
    rewrites both halves of a value such as ``3.14`` as integers first,
    after which ``FLOAT_PATTERN`` can no longer match anything and the
    ``<|FLOAT|>`` placeholder is never emitted.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every ``FLOAT_PATTERN`` match replaced by
        ``<|FLOAT|>`` and every remaining ``INT_PATTERN`` match replaced
        by ``<|INT|>``.
    """
    # The float placeholder contains no digits, so the integer pass below
    # cannot touch it.
    text = FLOAT_PATTERN.sub(r'<|FLOAT|>', text)
    text = INT_PATTERN.sub(r'<|INT|>', text)
    return text
|
| 176 |
|
| 177 |
# Split all log-entries in timestamps and log-messages
|
| 178 |
def split_log_entry(self, pred : str, ref: str):
|
| 179 |
+
pred_split_log = TIMESTAMP_PATTERN.split(pred)
|
| 180 |
+
ref_split_log = TIMESTAMP_PATTERN.split(ref)
|
| 181 |
|
| 182 |
# One logentry always consists of timestamp + log-message
|
| 183 |
pred_timestamps, pred_logMessages = [], []
|
|
|
|
| 209 |
self.scores["linecount_difference_SMAPE_score"] = self.smapeScore(pred_lines_amt, ref_lines_amt)
|
| 210 |
|
| 211 |
def set_sacrebleu_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
    """Compute the SacreBLEU score of the log messages and store it.

    Runs ``SACREBLEU_METRIC.compute`` on the predicted and reference log
    messages, divides the reported ``"score"`` by 100.0 and writes the
    result to ``self.scores["linecontent_sacrebleu_score"]``.

    Args:
        pred_log_messages: Log messages taken from the prediction.
        ref_log_messages: Log messages taken from the reference.
    """
    # NOTE(review): dividing by 100.0 presumably maps a 0-100 score onto
    # 0-1 -- confirm against the sacrebleu metric documentation.
    bleu_result = SACREBLEU_METRIC.compute(
        predictions=pred_log_messages,
        references=ref_log_messages,
    )
    self.scores["linecontent_sacrebleu_score"] = bleu_result["score"] / 100.0
|
| 214 |
|
| 215 |
def set_smape_length_score(self, pred_log_messages : List[str], ref_log_messages : List[str]) -> None:
|
|
|
|
| 220 |
vectorized_replaceNumbers = np.vectorize(self.replaceNumbers)
|
| 221 |
cleaned_pred_logMessages = vectorized_replaceNumbers(pred_log_messages)
|
| 222 |
cleaned_ref_logMessages = vectorized_replaceNumbers(ref_log_messages)
|
| 223 |
+
sacrebleu_withoutExplicitNumbers_score = SACREBLEU_METRIC.compute(predictions=cleaned_pred_logMessages, references=cleaned_ref_logMessages)["score"] / 100.0
|
| 224 |
self.scores["linecontent_sacrebleu_withoutExplicitNumbers_score"] = sacrebleu_withoutExplicitNumbers_score
|
| 225 |
|
| 226 |
# Get different scores regarding the content of a log-message
|