@@ -272,8 +272,14 @@ def __init__(self):
272272
273273 self .setLayout (self .main_layout )
274274
def update(self, data, exact_match_score, distance_match_score):
    """Refresh the card's headline label for one search result.

    Args:
        data: Result record. Reads ``first_name``, ``last_name``,
            ``application_role`` and ``result["keywords_distance"]`` (a
            mapping whose values are collections of hits).
        exact_match_score: Number of exact keyword hits; shown as a count.
        distance_match_score: Accepted for call compatibility; the label
            shows the number of distance hits recomputed from ``data``
            rather than this score.

    Side effects:
        Sets the text of ``self.primary_label`` and stores ``data`` on
        ``self.data``.
    """
    # Local import so this method brings its own dependency into scope.
    from html import escape

    exact_match_count = exact_match_score
    distance_match_count = sum(
        len(hits) for hits in data["result"]["keywords_distance"].values()
    )

    description_parts = []
    if exact_match_count > 0:
        description_parts.append(f"{exact_match_count} exact keywords matched")
    if distance_match_count > 0:
        description_parts.append(f"{distance_match_count} distance keywords matched")
    match_description = ", ".join(description_parts)

    # The label text carries HTML tags, so it is presumably rendered as rich
    # text: field values are escaped so characters such as "&" or "<" cannot
    # break the markup (quotes are left alone; they are harmless in text).
    first_name = escape(str(data["first_name"]), quote=False)
    last_name = escape(str(data["last_name"]), quote=False)
    role = escape(str(data["application_role"]), quote=False)

    label_text = f"<b>{first_name} {last_name}</b> – {role}"
    if match_description:
        # A raw "\n" is just whitespace in HTML; <br> is what actually
        # starts the second line. Skip the line entirely when nothing
        # matched instead of emitting an empty <i></i>.
        label_text += f"<br><i>{match_description}</i>"
    self.primary_label.setText(label_text)
    self.data = data
278284
279285 @asyncSlot ()
@@ -502,14 +508,11 @@ async def search_button_clicked(self):
502508 cv_cards = []
503509 @debounce (0.05 , 0.1 )
504510 async def update_ui ():
505- results_len = len (results )
506- if results_len == 0 :
507- results_len = sys .float_info .min
508511 time_elapsed = (time .monotonic_ns () - time_start ) / 1000000
509- time_text_adjusted = time_text / 1000000 / len (results )
510- time_keywords_exact_adjusted = time_keywords_exact / 1000000 / len (results )
511- time_keywords_distance_adjusted = time_keywords_distance / 1000000 / len (results )
512- time_summary_adjusted = time_summary / 1000000 / len (results )
512+ time_text_adjusted = time_text / 1000000 / len (results ) if len ( results ) > 0 else 0
513+ time_keywords_exact_adjusted = time_keywords_exact / 1000000 / len (results ) if len ( results ) > 0 else 0
514+ time_keywords_distance_adjusted = time_keywords_distance / 1000000 / len (results ) if len ( results ) > 0 else 0
515+ time_summary_adjusted = time_summary / 1000000 / len (results ) if len ( results ) > 0 else 0
513516 time_total_adjusted = time_text_adjusted + time_keywords_exact_adjusted + time_keywords_distance_adjusted + time_summary_adjusted
514517 if time_total_adjusted == 0 :
515518 time_total_adjusted = sys .float_info .min
@@ -522,22 +525,22 @@ async def update_ui():
522525 f"Time Total: { time_total_adjusted :.2f} ms/file ({ time_total_adjusted / time_total_adjusted * time_elapsed / 1000 :.2f} s cumulative)"
523526 )
524527 entries = sorted (
525- [(r , sum (len (v ) for v in r ["result" ]["keywords_exact" ].values ())) for r in results if sum ( len ( v ) for v in r ["result" ]["keywords_exact " ].values ()) > 0 ],
526- key = lambda e : e [1 ],
528+ [(r , sum (len (v ) for v in r ["result" ]["keywords_exact" ].values ()), sum ( sum ( s [ 2 ] for s in v ) for v in r ["result" ]["keywords_distance " ].values ())) for r in results ],
529+ key = lambda e : 0.7 * e [1 ] + 0.3 * e [ 2 ],
527530 reverse = True
528531 )[:top_count ]
529532 i = 0
530- for entry , match_count in entries :
533+ for entry , exact_match_score , distance_match_score in entries :
531534 if i >= len (cv_cards ):
532535 cv_card = CVCard ()
533536 self .result_area .addWidget (cv_card )
534537 cv_cards .append (cv_card )
535538 cv_card = cv_cards [i ]
536- cv_card .update (entry , match_count )
539+ cv_card .update (entry , exact_match_score , distance_match_score )
537540 i += 1
538541 async with self .database .execute ("SELECT d.id, d.applicant_id, d.application_role, d.cv_path, p.first_name, p.last_name, p.date_of_birth, p.address, p.phone_number FROM application_details d JOIN applicant_profiles p ON p.id = d.applicant_id" ) as cursor :
539542 async for id , applicant_id , application_role , cv_path , first_name , last_name , date_of_birth , address , phone_number in cursor :
540- while len (futures ) >= 16 :
543+ while len (futures ) >= 32 :
541544 wait_event = asyncio .Event ()
542545 await wait_event .wait ()
543546 data = {
@@ -659,8 +662,8 @@ def cv_extract_information(data: dict, keywords: set[str], algo_exact: str):
659662 time_keywords_distance = time .monotonic_ns ()
660663 summary = extract_cv_summary (text )
661664 time_summary = time .monotonic_ns ()
662- with open (data ["cv_path" ] + ".txt" , "w" , encoding = "utf-8" ) as f :
663- f .write (text )
665+ # with open(data["cv_path"] + ".txt", "w", encoding="utf-8") as f:
666+ # f.write(text)
664667 return {
665668 "time_start" : time_start ,
666669 "text" : text ,
@@ -698,8 +701,50 @@ def text_keywords_exact_aco(text: str, keywords: set[str]):
698701 matcher = AhoCorasickMatcher ()
699702 return matcher .search_patterns (text , list (keywords ))
700703
# Bounded memo for levenshtein_distance: maps an order-normalised pair of
# strings to their edit distance. Oldest entries are evicted first.
_levenshtein_distance_cache = {}


def levenshtein_distance(a: str, b: str) -> int:
    """Return the Levenshtein (edit) distance between ``a`` and ``b``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions that turn one string into the other.
    Results are memoised in ``_levenshtein_distance_cache``, which holds at
    most 2048 entries and drops the oldest inserted entry when full.

    Args:
        a: First string.
        b: Second string.

    Returns:
        The edit distance as a non-negative integer.
    """
    # Trivial cases need neither the cache nor the DP table.
    if a == b:
        return 0
    # The distance is symmetric, so order the pair deterministically: mirrored
    # calls then share one cache slot, and the shorter string forms the DP row.
    if (len(a), a) < (len(b), b):
        a, b = b, a
    if not b:
        return len(a)

    key = (a, b)
    cached = _levenshtein_distance_cache.get(key)
    if cached is not None:
        return cached

    # Dicts preserve insertion order, so the first key is the oldest entry.
    if len(_levenshtein_distance_cache) >= 2048:
        _levenshtein_distance_cache.pop(next(iter(_levenshtein_distance_cache)))

    # Classic two-row dynamic programme: previous_row[j] holds the distance
    # between the processed prefix of ``a`` and the first j characters of ``b``.
    previous_row = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        current_row = [i]
        for j, char_b in enumerate(b, start=1):
            current_row.append(min(
                previous_row[j] + 1,                        # deletion
                current_row[j - 1] + 1,                     # insertion
                previous_row[j - 1] + (char_a != char_b),   # substitution
            ))
        previous_row = current_row

    result = previous_row[-1]
    _levenshtein_distance_cache[key] = result
    return result
727+
def text_keywords_distance_levenshtein(text: str, keywords: set[str]):
    """Find words in ``text`` that are within a small edit distance of a keyword.

    The text is lowercased and split on whitespace. Only words and keywords
    longer than four characters take part; shorter keywords still get an
    (empty) entry in the result. A word is a hit when its Levenshtein distance
    to the keyword is below the threshold of 2, i.e. 0 or 1.

    Args:
        text: Text to scan.
        keywords: Keywords to look for. Compared case-insensitively, because
            the text is lowercased before matching.

    Returns:
        A dict mapping each original keyword to a list of
        ``(word_index, word_length, score)`` tuples, where ``word_index`` is
        the position in the whitespace-split text and ``score`` is
        ``(threshold - distance) / threshold`` (1.0 for an identical word,
        0.5 for distance 1).
    """
    distance_threshold = 2
    word_len_min = 4

    # A bare split() already collapses any run of whitespace, so no regex
    # normalisation is needed. The eligible words are the same for every
    # keyword, so they are filtered once up front.
    candidates = [
        (index, word)
        for index, word in enumerate(text.lower().split())
        if len(word) > word_len_min
    ]

    result = {}
    for keyword in keywords:
        collects = []
        result[keyword] = collects
        if len(keyword) <= word_len_min:
            continue
        # The text was lowercased above; compare against a lowercased keyword
        # so a capitalised keyword can still score an exact (distance 0) hit.
        needle = keyword.lower()
        needle_len = len(needle)
        for index, word in candidates:
            # The edit distance is never smaller than the length difference,
            # so such pairs cannot be hits; skip the expensive computation.
            if abs(len(word) - needle_len) >= distance_threshold:
                continue
            dist = levenshtein_distance(word, needle)
            if dist >= distance_threshold:
                continue
            collects.append(
                (index, len(word), (distance_threshold - dist) / distance_threshold)
            )
    return result
703748
def extract_cv_summary(text: str):
    """Placeholder for CV summary extraction (not implemented yet).

    Args:
        text: CV text; currently ignored.

    Returns:
        Always ``None``.
    """
    return None
0 commit comments