Skip to content

Commit 7357f1b

Browse files
committed
feat: implement Aho-Corasick algorithm integration for pattern matching
1 parent 43fe62a commit 7357f1b

File tree

3 files changed

+161
-2
lines changed

3 files changed

+161
-2
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ sudo apt update && sudo apt install -y libxcb-cursor0 libxcb-cursor-dev qt6-base
6666

6767
```bash
6868
uv sync
69-
uv run python src/main.py
69+
uv run python -m src.main
7070
```
7171

7272
## Project Structure

src/AhoCorasick.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
from collections import deque, defaultdict
2+
3+
class TrieNode:
4+
def __init__(self):
5+
self.children = {}
6+
self.failure = None
7+
self.output = []
8+
self.is_end_of_word = False
9+
10+
class AhoCorasickMatcher:
11+
def __init__(self):
12+
self.root = TrieNode()
13+
14+
def build_trie(self, patterns):
15+
"""Build the trie structure with all patterns"""
16+
for pattern in patterns:
17+
if not pattern: # Skip empty patterns
18+
continue
19+
current = self.root
20+
for char in pattern:
21+
if char not in current.children:
22+
current.children[char] = TrieNode()
23+
current = current.children[char]
24+
current.is_end_of_word = True
25+
current.output.append(pattern)
26+
27+
def build_failure_function(self):
28+
"""Build the failure function using BFS"""
29+
queue = deque()
30+
31+
# Initialize failure function for depth 1 nodes
32+
for child in self.root.children.values():
33+
child.failure = self.root
34+
queue.append(child)
35+
36+
# Build failure function for remaining nodes
37+
while queue:
38+
current = queue.popleft()
39+
40+
for char, child in current.children.items():
41+
queue.append(child)
42+
43+
# Find the failure link
44+
failure = current.failure
45+
while failure is not None and char not in failure.children:
46+
failure = failure.failure
47+
48+
if failure is not None:
49+
child.failure = failure.children[char]
50+
else:
51+
child.failure = self.root
52+
53+
# Add output patterns from failure node
54+
child.output.extend(child.failure.output)
55+
56+
def search_patterns(self, text, patterns, case_sensitive=False):
57+
"""
58+
Search for multiple patterns simultaneously in text.
59+
Returns a dictionary mapping each pattern to list of starting positions.
60+
"""
61+
if not patterns or not text:
62+
return {pattern: [] for pattern in patterns}
63+
64+
# Convert to lowercase if case insensitive
65+
if not case_sensitive:
66+
text = text.lower()
67+
patterns = [pattern.lower() for pattern in patterns]
68+
69+
# Build the automaton
70+
self.build_trie(patterns)
71+
self.build_failure_function()
72+
73+
# Initialize results dictionary
74+
results = {pattern: [] for pattern in patterns}
75+
76+
# Search through the text
77+
current = self.root
78+
79+
for i, char in enumerate(text):
80+
# Follow failure links until we find a match or reach root
81+
while current is not None and char not in current.children:
82+
current = current.failure
83+
84+
if current is None:
85+
current = self.root
86+
continue
87+
88+
current = current.children[char]
89+
90+
# Check for pattern matches
91+
for pattern in current.output:
92+
start_pos = i - len(pattern) + 1
93+
results[pattern].append(start_pos)
94+
95+
return results
96+
97+
def search_pattern(self, text, pattern, case_sensitive=False):
98+
"""
99+
Search for a single pattern in text.
100+
Returns a list of starting positions (compatible with KMP/BM interface).
101+
"""
102+
if not pattern or not text:
103+
return []
104+
105+
result = self.search_patterns(text, [pattern], case_sensitive)
106+
return result.get(pattern, [])
107+
108+
def count_patterns(self, text, patterns, case_sensitive=False):
109+
results = self.search_patterns(text, patterns, case_sensitive)
110+
return {pattern: len(positions) for pattern, positions in results.items()}
111+
112+
def find_first_occurrence(self, text, patterns, case_sensitive=False):
113+
if not patterns or not text:
114+
return None, -1
115+
116+
# Convert to lowercase if case insensitive
117+
if not case_sensitive:
118+
text = text.lower()
119+
patterns = [pattern.lower() for pattern in patterns]
120+
121+
# Build the automaton
122+
self.build_trie(patterns)
123+
self.build_failure_function()
124+
125+
# Search for first match
126+
current = self.root
127+
128+
for i, char in enumerate(text):
129+
# Follow failure links until we find a match or reach root
130+
while current is not None and char not in current.children:
131+
current = current.failure
132+
133+
if current is None:
134+
current = self.root
135+
continue
136+
137+
current = current.children[char]
138+
139+
# Check for pattern matches
140+
if current.output:
141+
pattern = current.output[0] # Return first pattern found
142+
start_pos = i - len(pattern) + 1
143+
return pattern, start_pos
144+
145+
return None, -1

src/main.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,14 +325,17 @@ def __init__(self):
325325
self.algo_group = QHBoxLayout()
326326
self.kmp_button = QRadioButton("KMP")
327327
self.bm_button = QRadioButton("BM")
328+
self.ac_button = QRadioButton("Aho-Corasick")
328329
self.bm_button.setChecked(True)
329330

330331
self.algo_buttons = QButtonGroup(self)
331332
self.algo_buttons.addButton(self.kmp_button)
332333
self.algo_buttons.addButton(self.bm_button)
334+
self.algo_buttons.addButton(self.ac_button)
333335

334336
self.algo_group.addWidget(self.kmp_button)
335337
self.algo_group.addWidget(self.bm_button)
338+
self.algo_group.addWidget(self.ac_button)
336339

337340
# --- Top Matches Selector ---
338341
self.top_selector = QComboBox()
@@ -645,7 +648,12 @@ def cv_extract_information(data: dict, keywords: set[str], algo_exact: str):
645648
text = pdf_extract_text(data["cv_path"])
646649
additional_text = f'{data["id"]} {data["applicant_id"]} {data["application_role"]} {data["first_name"]} {data["last_name"]} {data["date_of_birth"]} {data["address"]} {data["phone_number"]}'
647650
time_text = time.monotonic_ns()
648-
keywords_exact = text_keywords_exact_kmp(text + additional_text, keywords) if algo_exact == "KMP" else text_keywords_exact_bm(text + additional_text, keywords)
651+
if algo_exact == "KMP":
652+
keywords_exact = text_keywords_exact_kmp(text + additional_text, keywords)
653+
elif algo_exact == "BM":
654+
keywords_exact = text_keywords_exact_bm(text + additional_text, keywords)
655+
else:
656+
keywords_exact = text_keywords_exact_aco(text + additional_text, keywords)
649657
time_keywords_exact = time.monotonic_ns()
650658
keywords_distance = text_keywords_distance_levenshtein(text + additional_text, keywords)
651659
time_keywords_distance = time.monotonic_ns()
@@ -673,6 +681,7 @@ def pdf_extract_text(cv_path: str):
673681

674682
from .KMP import KMPMatcher
675683
from .BoyerMoore import BoyerMooreMatcher
684+
from .AhoCorasick import AhoCorasickMatcher
676685

677686
def text_keywords_exact_kmp(text: str, keywords: set[str]):
678687
text = re.sub(r'\s+', ' ', text)
@@ -684,6 +693,11 @@ def text_keywords_exact_bm(text: str, keywords: set[str]):
684693
matcher = BoyerMooreMatcher()
685694
return {keyword: matcher.search_pattern(text, keyword) for keyword in keywords}
686695

696+
def text_keywords_exact_aco(text: str, keywords: set[str]):
697+
text = re.sub(r'\s+', ' ', text)
698+
matcher = AhoCorasickMatcher()
699+
return matcher.search_patterns(text, list(keywords))
700+
687701
def text_keywords_distance_levenshtein(text: str, keywords: set[str]):
688702
pass
689703

0 commit comments

Comments
 (0)