feat: basic levenshtein

NadhifRadityo · SayyakuHajime · commit 41d0b5a9ca4a · 2025-06-15T21:00:31.000+07:00
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 venv/
 __pycache__
+.private
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -15,6 +15,7 @@
         {
             "name": "generate_section_names.mjs",
             "program": "${workspaceFolder}/src/generate_section_names.mjs",
+            "cwd": "${workspaceFolder}/src",
             "request": "launch",
             "skipFiles": [
                 "<node_internals>/**"
diff --git a/src/main.py b/src/main.py
@@ -272,8 +272,14 @@ def __init__(self):
 
         self.setLayout(self.main_layout)
 
-    def update(self, data, match_count):
-        self.primary_label.setText(f'<b>{data["first_name"]} {data["last_name"]}</b> – {data["application_role"]} ({match_count} keywords matched)')
+    def update(self, data, exact_match_score, distance_match_score):
+        exact_match_count = exact_match_score
+        distance_match_count = sum(len(v) for v in data["result"]["keywords_distance"].values())
+        match_description = ", ".join([v for v in [
+            (f"{exact_match_count} exact keywords matched" if exact_match_count > 0 else None),
+            (f"{distance_match_count} distance keywords matched" if distance_match_count > 0 else None)
+        ] if v is not None])
+        self.primary_label.setText(f'<b>{data["first_name"]} {data["last_name"]}</b> – {data["application_role"]}\n<i>{match_description}</i>')
         self.data = data
     
     @asyncSlot()
@@ -502,14 +508,11 @@ async def search_button_clicked(self):
         cv_cards = []
         @debounce(0.05, 0.1)
         async def update_ui():
-            results_len = len(results)
-            if results_len == 0:
-                results_len = sys.float_info.min
             time_elapsed = (time.monotonic_ns() - time_start) / 1000000
-            time_text_adjusted = time_text / 1000000 / len(results)
-            time_keywords_exact_adjusted = time_keywords_exact / 1000000 / len(results)
-            time_keywords_distance_adjusted = time_keywords_distance / 1000000 / len(results)
-            time_summary_adjusted = time_summary / 1000000 / len(results)
+            time_text_adjusted = time_text / 1000000 / len(results) if len(results) > 0 else 0
+            time_keywords_exact_adjusted = time_keywords_exact / 1000000 / len(results) if len(results) > 0 else 0
+            time_keywords_distance_adjusted = time_keywords_distance / 1000000 / len(results) if len(results) > 0 else 0
+            time_summary_adjusted = time_summary / 1000000 / len(results) if len(results) > 0 else 0
             time_total_adjusted = time_text_adjusted + time_keywords_exact_adjusted + time_keywords_distance_adjusted + time_summary_adjusted
             if time_total_adjusted == 0:
                 time_total_adjusted = sys.float_info.min
@@ -522,22 +525,22 @@ async def update_ui():
                 f"Time Total: {time_total_adjusted:.2f}ms/file ({time_total_adjusted / time_total_adjusted * time_elapsed / 1000:.2f}s cumulative)"
             )
             entries = sorted(
-                [(r, sum(len(v) for v in r["result"]["keywords_exact"].values())) for r in results if sum(len(v) for v in r["result"]["keywords_exact"].values()) > 0],
-                key=lambda e: e[1],
+                [(r, sum(len(v) for v in r["result"]["keywords_exact"].values()), sum(sum(s[2] for s in v) for v in r["result"]["keywords_distance"].values())) for r in results],
+                key=lambda e: 0.7 * e[1] + 0.3 * e[2],
                 reverse=True
             )[:top_count]
             i = 0
-            for entry, match_count in entries:
+            for entry, exact_match_score, distance_match_score in entries:
                 if i >= len(cv_cards):
                     cv_card = CVCard()
                     self.result_area.addWidget(cv_card)
                     cv_cards.append(cv_card)
                 cv_card = cv_cards[i]
-                cv_card.update(entry, match_count)
+                cv_card.update(entry, exact_match_score, distance_match_score)
                 i += 1
         async with self.database.execute("SELECT d.id, d.applicant_id, d.application_role, d.cv_path, p.first_name, p.last_name, p.date_of_birth, p.address, p.phone_number FROM application_details d JOIN applicant_profiles p ON p.id = d.applicant_id") as cursor:
             async for id, applicant_id, application_role, cv_path, first_name, last_name, date_of_birth, address, phone_number in cursor:
-                while len(futures) >= 16:
+                while len(futures) >= 32:
                     wait_event = asyncio.Event()
                     await wait_event.wait()
                 data = {
@@ -659,8 +662,8 @@ def cv_extract_information(data: dict, keywords: set[str], algo_exact: str):
     time_keywords_distance = time.monotonic_ns()
     summary = extract_cv_summary(text)
     time_summary = time.monotonic_ns()
-    with open(data["cv_path"] + ".txt", "w", encoding="utf-8") as f:
-        f.write(text)
+    # with open(data["cv_path"] + ".txt", "w", encoding="utf-8") as f:
+    #     f.write(text)
     return {
         "time_start": time_start,
         "text": text,
@@ -698,8 +701,50 @@ def text_keywords_exact_aco(text: str, keywords: set[str]):
     matcher = AhoCorasickMatcher()
     return matcher.search_patterns(text, list(keywords))
 
+_levenshtein_distance_cache = {}
+def levenshtein_distance(a: str, b: str) -> int:
+    key = (a, b)
+    if key in _levenshtein_distance_cache:
+        return _levenshtein_distance_cache[key]
+    if len(_levenshtein_distance_cache) >= 2048:
+        _levenshtein_distance_cache.pop(next(iter(_levenshtein_distance_cache)))
+    m, n = len(a), len(b)
+    prev_row = list(range(n + 1))
+    curr_row = [0] * (n + 1)
+    for i in range(1, m + 1):
+        curr_row[0] = i
+        for j in range(1, n + 1):
+            cost = 0 if a[i - 1] == b[j - 1] else 1
+            curr_row[j] = min(
+                prev_row[j] + 1,
+                curr_row[j - 1] + 1,
+                prev_row[j - 1] + cost
+            )
+        prev_row, curr_row = curr_row, prev_row
+    result = prev_row[n]
+    _levenshtein_distance_cache[key] = result
+    return result
+
 def text_keywords_distance_levenshtein(text: str, keywords: set[str]):
-    pass
+    text = re.sub(r'\s+', ' ', text).lower()
+    words = text.split()
+    result = {}
+    distance_threshold = 2
+    word_len_min = 4
+    for keyword in keywords:
+        collects = []
+        result[keyword] = collects
+        if len(keyword) <= word_len_min:
+            continue
+        distances = [
+            (i, word, levenshtein_distance(word, keyword))
+            for i, word in enumerate(words) if len(word) > word_len_min
+        ]
+        for index, word, dist in distances:
+            if dist >= distance_threshold:
+                continue
+            collects.append((index, len(word), (distance_threshold - dist) / distance_threshold))
+    return result
 
 def extract_cv_summary(text: str):
     pass

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`venv/`
`2`	`2`	`__pycache__`
	`3`	`+.private`
Original file line number	Diff line number	Diff line change
`@@ -15,6 +15,7 @@`
`15`	`15`	`{`
`16`	`16`	`"name": "generate_section_names.mjs",`
`17`	`17`	`"program": "${workspaceFolder}/src/generate_section_names.mjs",`
	`18`	`+ "cwd": "${workspaceFolder}/src",`
`18`	`19`	`"request": "launch",`
`19`	`20`	`"skipFiles": [`
`20`	`21`	`"<node_internals>/**"`