Skip to content

Commit 41d0b5a

Browse files
NadhifRadityoSayyakuHajime
authored and committed
feat: basic levenshtein
1 parent 53148d8 commit 41d0b5a

File tree

3 files changed

+64
-17
lines changed

3 files changed

+64
-17
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
venv/
22
__pycache__
3+
.private

.vscode/launch.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
{
1616
"name": "generate_section_names.mjs",
1717
"program": "${workspaceFolder}/src/generate_section_names.mjs",
18+
"cwd": "${workspaceFolder}/src",
1819
"request": "launch",
1920
"skipFiles": [
2021
"<node_internals>/**"

src/main.py

Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,14 @@ def __init__(self):
272272

273273
self.setLayout(self.main_layout)
274274

275-
def update(self, data, match_count):
276-
self.primary_label.setText(f'<b>{data["first_name"]} {data["last_name"]}</b> – {data["application_role"]} ({match_count} keywords matched)')
275+
def update(self, data, exact_match_score, distance_match_score):
    """Refresh the card's headline label for one search result and keep the record.

    Args:
        data: Result record. Reads ``"first_name"``, ``"last_name"``,
            ``"application_role"`` and ``data["result"]["keywords_distance"]``
            (a mapping whose values are sized collections of fuzzy matches).
        exact_match_score: Number of exact keyword matches; displayed as-is.
        distance_match_score: Aggregate fuzzy-match score supplied by the
            caller. It is not displayed; the fuzzy match *count* shown on the
            card is derived from ``data`` instead.
    """
    exact_match_count = exact_match_score
    distance_match_count = sum(len(v) for v in data["result"]["keywords_distance"].values())

    # Only mention the match kinds that actually occurred.
    parts = []
    if exact_match_count > 0:
        parts.append(f"{exact_match_count} exact keywords matched")
    if distance_match_count > 0:
        parts.append(f"{distance_match_count} distance keywords matched")
    match_description = ", ".join(parts)

    # The label text is rich text (it relies on <b>/<i>), where a raw "\n" is
    # collapsed like any other whitespace; <br> is what produces the line break.
    self.primary_label.setText(
        f'<b>{data["first_name"]} {data["last_name"]}</b> – {data["application_role"]}'
        f'<br><i>{match_description}</i>'
    )
    self.data = data
278284

279285
@asyncSlot()
@@ -502,14 +508,11 @@ async def search_button_clicked(self):
502508
cv_cards = []
503509
@debounce(0.05, 0.1)
504510
async def update_ui():
505-
results_len = len(results)
506-
if results_len == 0:
507-
results_len = sys.float_info.min
508511
time_elapsed = (time.monotonic_ns() - time_start) / 1000000
509-
time_text_adjusted = time_text / 1000000 / len(results)
510-
time_keywords_exact_adjusted = time_keywords_exact / 1000000 / len(results)
511-
time_keywords_distance_adjusted = time_keywords_distance / 1000000 / len(results)
512-
time_summary_adjusted = time_summary / 1000000 / len(results)
512+
time_text_adjusted = time_text / 1000000 / len(results) if len(results) > 0 else 0
513+
time_keywords_exact_adjusted = time_keywords_exact / 1000000 / len(results) if len(results) > 0 else 0
514+
time_keywords_distance_adjusted = time_keywords_distance / 1000000 / len(results) if len(results) > 0 else 0
515+
time_summary_adjusted = time_summary / 1000000 / len(results) if len(results) > 0 else 0
513516
time_total_adjusted = time_text_adjusted + time_keywords_exact_adjusted + time_keywords_distance_adjusted + time_summary_adjusted
514517
if time_total_adjusted == 0:
515518
time_total_adjusted = sys.float_info.min
@@ -522,22 +525,22 @@ async def update_ui():
522525
f"Time Total: {time_total_adjusted:.2f}ms/file ({time_total_adjusted / time_total_adjusted * time_elapsed / 1000:.2f}s cumulative)"
523526
)
524527
entries = sorted(
525-
[(r, sum(len(v) for v in r["result"]["keywords_exact"].values())) for r in results if sum(len(v) for v in r["result"]["keywords_exact"].values()) > 0],
526-
key=lambda e: e[1],
528+
[(r, sum(len(v) for v in r["result"]["keywords_exact"].values()), sum(sum(s[2] for s in v) for v in r["result"]["keywords_distance"].values())) for r in results],
529+
key=lambda e: 0.7 * e[1] + 0.3 * e[2],
527530
reverse=True
528531
)[:top_count]
529532
i = 0
530-
for entry, match_count in entries:
533+
for entry, exact_match_score, distance_match_score in entries:
531534
if i >= len(cv_cards):
532535
cv_card = CVCard()
533536
self.result_area.addWidget(cv_card)
534537
cv_cards.append(cv_card)
535538
cv_card = cv_cards[i]
536-
cv_card.update(entry, match_count)
539+
cv_card.update(entry, exact_match_score, distance_match_score)
537540
i += 1
538541
async with self.database.execute("SELECT d.id, d.applicant_id, d.application_role, d.cv_path, p.first_name, p.last_name, p.date_of_birth, p.address, p.phone_number FROM application_details d JOIN applicant_profiles p ON p.id = d.applicant_id") as cursor:
539542
async for id, applicant_id, application_role, cv_path, first_name, last_name, date_of_birth, address, phone_number in cursor:
540-
while len(futures) >= 16:
543+
while len(futures) >= 32:
541544
wait_event = asyncio.Event()
542545
await wait_event.wait()
543546
data = {
@@ -659,8 +662,8 @@ def cv_extract_information(data: dict, keywords: set[str], algo_exact: str):
659662
time_keywords_distance = time.monotonic_ns()
660663
summary = extract_cv_summary(text)
661664
time_summary = time.monotonic_ns()
662-
with open(data["cv_path"] + ".txt", "w", encoding="utf-8") as f:
663-
f.write(text)
665+
# with open(data["cv_path"] + ".txt", "w", encoding="utf-8") as f:
666+
# f.write(text)
664667
return {
665668
"time_start": time_start,
666669
"text": text,
@@ -698,8 +701,50 @@ def text_keywords_exact_aco(text: str, keywords: set[str]):
698701
matcher = AhoCorasickMatcher()
699702
return matcher.search_patterns(text, list(keywords))
700703

704+
_levenshtein_distance_cache = {}


def levenshtein_distance(a: str, b: str) -> int:
    """Return the Levenshtein (edit) distance between *a* and *b*.

    Insertions, deletions and substitutions each cost 1. Results are memoized
    in ``_levenshtein_distance_cache`` keyed by the ordered pair ``(a, b)``;
    once the cache holds 2048 entries the oldest-inserted one is dropped
    before a new result is computed and stored.
    """
    cache = _levenshtein_distance_cache
    pair = (a, b)
    cached = cache.get(pair)
    if cached is not None:
        return cached

    if len(cache) >= 2048:
        # Dicts iterate in insertion order, so the first key is the oldest.
        del cache[next(iter(cache))]

    width = len(b)
    # Row 0 of the DP table: distance from the empty prefix of `a` to each prefix of `b`.
    above = list(range(width + 1))
    for row, ch_a in enumerate(a, start=1):
        current = [row]
        for col, ch_b in enumerate(b, start=1):
            deletion = above[col] + 1
            insertion = current[col - 1] + 1
            substitution = above[col - 1] + (0 if ch_a == ch_b else 1)
            current.append(min(deletion, insertion, substitution))
        above = current

    distance = above[width]
    cache[pair] = distance
    return distance
727+
701728
def text_keywords_distance_levenshtein(text: str, keywords: set[str]):
    """Find words in *text* that are within a small edit distance of each keyword.

    The text is lower-cased and split on whitespace. Only words and keywords
    longer than four characters take part in matching, and a word matches a
    keyword when their Levenshtein distance is below 2 (i.e. 0 or 1).

    Args:
        text: Text to scan.
        keywords: Keywords to look for. Each keyword is compared in lower case
            but reported under its original spelling.

    Returns:
        A dict mapping every keyword to a list of
        ``(word_index, word_length, score)`` tuples, where ``word_index`` is
        the word's position in the whitespace-split text and ``score`` is
        ``(2 - distance) / 2`` (1.0 for an identical word, 0.5 for one edit).
        Keywords of four characters or fewer map to an empty list.
    """
    distance_threshold = 2
    word_len_min = 4

    # str.split() already collapses any run of whitespace, so no regex
    # normalisation pass is needed. The length filter does not depend on the
    # keyword, so it is applied once here rather than once per keyword.
    candidates = [
        (index, word)
        for index, word in enumerate(text.lower().split())
        if len(word) > word_len_min
    ]

    result = {}
    for keyword in keywords:
        collects = []
        result[keyword] = collects
        if len(keyword) <= word_len_min:
            continue
        # The text was lower-cased above; compare like with like so that a
        # mixed-case keyword can still score as an exact (distance 0) match.
        needle = keyword.lower()
        needle_len = len(needle)
        for index, word in candidates:
            # Edit distance is never smaller than the length difference, so
            # such pairs cannot be under the threshold; skip the costly DP.
            if abs(len(word) - needle_len) >= distance_threshold:
                continue
            dist = levenshtein_distance(word, needle)
            if dist >= distance_threshold:
                continue
            collects.append((index, len(word), (distance_threshold - dist) / distance_threshold))
    return result
703748

704749
def extract_cv_summary(text: str):
    """Placeholder for CV summary extraction; not implemented yet.

    Ignores *text* and returns ``None``.
    """
    return None

0 commit comments

Comments
 (0)