Skip to content

Commit

Permalink
optimized regex, separated backend functions, and generally made bett…
Browse files Browse the repository at this point in the history
…er readable code
  • Loading branch information
hitblast committed Aug 11, 2024
1 parent 172e98f commit 6960255
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 163 deletions.
2 changes: 1 addition & 1 deletion avro/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@
from .main import *

# Version information.
__version_info__ = (2024, 8, 8)
__version_info__ = (2024, 8, 11)
__version__ = ".".join(map(str, __version_info__))
262 changes: 100 additions & 162 deletions avro/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: MIT


# Import first-party Python libraries.
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
Expand All @@ -13,8 +12,8 @@
from .utils.config import BIJOY_MAP, BIJOY_MAP_REVERSE


# The helper function for handling multithreaded workloads.
def _concurrency_helper(func: Callable, params: Tuple[str]) -> List[str]:
# Concurrency helper function for handling multithreaded workloads.
def _concurrency_helper(func: Callable, params: Tuple[str, ...]) -> List[str]:
output = []

with ThreadPoolExecutor() as executor:
Expand All @@ -33,78 +32,112 @@ def _concurrency_helper(func: Callable, params: Tuple[str]) -> List[str]:
REVERSE_REGEX = re.compile(r"(\s|\.|,|\?|।|-|;|')", re.UNICODE)


# The primary parse function for the library.
def parse(*texts: str, bijoy: bool = False, remap_words: bool = True) -> Union[str, List[str]]:
"""
#### Parses input text, matches and replaces using the Avro Dictionary.
# Backend Functions.
# The output generator for the parse function.
def _parse_output_generator(fixed_text: str, cur_end: int) -> Generator[str, None, None]:
    """Yield the output pieces for ``fixed_text``, one piece per consumed span.

    Parameters:
    - `fixed_text: str`: The text to walk through, character by character.
    - `cur_end: int`: Index up to which the text counts as already consumed;
      characters before it are skipped unless `UTF8_REGEX` rejects them.

    Yields:
    - The replacement for each matched span, or the character itself when
      nothing matches.
    """
    for index, char in enumerate(fixed_text):
        # Characters that UTF8_REGEX does not match are passed through as-is.
        if UTF8_REGEX.match(char) is None:
            cur_end = index + 1
            yield char
            continue

        # Still inside the span consumed by an earlier match: emit nothing.
        if index < cur_end:
            continue

        # First attempt: patterns looked up with rule=False.
        plain = processor.match_patterns(fixed_text, index, rule=False)
        if plain["matched"]:
            yield plain["replaced"]
            cur_end = index + len(plain["found"])
            continue

        # Second attempt: patterns looked up with rule=True.
        ruled = processor.match_patterns(fixed_text, index, rule=True)
        if not ruled["matched"]:
            # Nothing matched at all; emit the character and move on by one.
            cur_end = index + 1
            yield char
            continue

        cur_end = index + len(ruled["found"])
        replaced = processor.process_rules(
            rules=ruled["rules"], fixed_text=fixed_text, cur=index, cur_end=cur_end
        )
        # Fall back to the pattern's own replacement when the rules give nothing.
        yield replaced if replaced else ruled["replaced"]

If a valid replacement is found, then it returns the replaced string.
If no replacement is found, then it instead returns the input text.

Parameters:
- `*texts: str | Tuple[str]`: The text(s) to parse.
- `bijoy: bool = False`: Whether to return in the Bijoy Keyboard format (ASCII).
- `remap_words: bool = True`: Whether to parse input text with remapped (exception) words.
# The working backend for the parse() function.
@lru_cache(maxsize=128)
def _parse_backend(text: str, remap_words: bool) -> str:
fixed_text = validate.fix_string_case(text) # Sanitize input text.
manual_required = True # Whether manual intervention is required.
cur_end = 0 # Cursor end point.

Usage:
```python
import avro
# Replace predefined exceptions in the input text.
if remap_words:
fixed_text, manual_required = processor.find_in_remap(fixed_text)

parsed = avro.parse('ami banglay gan gai')
print(parsed)
```
"""
return (
"".join(chain.from_iterable(_parse_output_generator(fixed_text, cur_end)))
if manual_required
else fixed_text
)

@lru_cache(maxsize=128)
def _parse_backend(text: str) -> str:
fixed_text = validate.fix_string_case(text) # Sanitize input text.
manual_required = True # Whether manual intervention is required.
cur_end = 0 # Cursor end point.

# Replace predefined exceptions in the input text.
if remap_words:
fixed_text, manual_required = processor.find_in_remap(fixed_text)
# The working backend for the to_bijoy() function.
@lru_cache(maxsize=128)
def _convert_backend(text: str) -> str:
text = processor.rearrange_unicode_text(re.sub("ৌ", "ৌ", re.sub("ো", "ো", text)))

def output_generator() -> Generator[str, None, None]:
nonlocal cur_end
for unic in BIJOY_MAP:
text = re.sub(unic, BIJOY_MAP[unic], text)

# Iterate through input text.
for cur, i in enumerate(fixed_text):
uni_pass = UTF8_REGEX.match(i) is not None
return text.strip()

if not uni_pass:
cur_end = cur + 1
yield i

elif cur >= cur_end and uni_pass:
match = processor.match_patterns(fixed_text, cur, rule=False)
matched = match["matched"]
# The working backend for the to_unicode() function.
@lru_cache(maxsize=128)
def _convert_backend_unicode(text: str) -> str:
for ascii_c in BIJOY_MAP_REVERSE:
text = re.sub(re.escape(ascii_c), BIJOY_MAP_REVERSE[ascii_c], text)

if matched:
yield match["replaced"]
cur_end = cur + len(match["found"])
else:
match = processor.match_patterns(fixed_text, cur, rule=True)
matched = match["matched"]
text = re.sub("অা", "আ", processor.rearrange_bijoy_text(text))
return text.strip()


# The output generator for the reverse function.
def _reverse_output_generator(text: str) -> Generator[str, None, None]:
    """Yield the reversed output for ``text``, one piece per input character.

    Parameters:
    - `text: str`: The text to reverse.

    Yields:
    - For a character that can be encoded as UTF-8: the match's `"reversed"`
      value (or its `"found"` value when `"reversed"` is falsy) if
      `processor.match_patterns` reports a match, otherwise the character.
    - For a character that cannot be encoded as UTF-8 (such as a lone
      surrogate): the character itself, unchanged.
    """
    for cur, i in enumerate(text):
        try:
            # Only the encodability probe is guarded. `str.encode` raises
            # UnicodeEncodeError (never UnicodeDecodeError), so catch the
            # common base class to make the pass-through branch reachable.
            i.encode("utf-8")
        except UnicodeError:
            yield i
        else:
            match = processor.match_patterns(text, cur, rule=False, reversed=True)
            yield (match["reversed"] or match["found"]) if match["matched"] else i


# The working backend for the reverse() function.
@lru_cache(maxsize=128)
def _reverse_backend(text: str, remap_words: bool) -> str:
manual_required = True # Whether manual intervention is required.

if matched:
cur_end = cur + len(match["found"])
replaced = processor.process_rules(
rules=match["rules"], fixed_text=fixed_text, cur=cur, cur_end=cur_end
)
# Replace predefined exceptions in the input text.
if remap_words:
text, manual_required = processor.find_in_remap(text, reversed=True)

if replaced:
yield replaced
else:
yield match["replaced"]
return "".join(chain.from_iterable(_reverse_output_generator(text))) if manual_required else text

if not matched:
cur_end = cur + 1
yield i

return "".join(chain.from_iterable(output_generator())) if manual_required else fixed_text
# Backend extension for the reverse() function.
@lru_cache(maxsize=128)
def _reverse_backend_ext(text: str, remap_words: bool) -> str:
    """Reverse ``text`` piece by piece and join the results back together.

    The text is split with `REVERSE_REGEX`, every resulting piece is passed
    through `_reverse_backend`, and the outputs are concatenated in order.

    Parameters:
    - `text: str`: The text to reverse.
    - `remap_words: bool`: Forwarded unchanged to `_reverse_backend`.

    Returns:
    - `str`: The concatenation of the reversed pieces.
    """
    return "".join(
        _reverse_backend(piece, remap_words) for piece in REVERSE_REGEX.split(text)
    )

output = _concurrency_helper(_parse_backend, texts)

# Primary user-end functions.
# The parse() function.
# Used to parse from English Roman script to Bengali in Unicode.
def parse(*texts: str, bijoy: bool = False, remap_words: bool = True) -> Union[str, List[str]]:
output = _concurrency_helper(lambda text: _parse_backend(text, remap_words), texts)

# If the `bijoy` parameter is set to `True`, then convert the output to Bijoy Keyboard format.
if bijoy:
Expand All @@ -113,117 +146,22 @@ def output_generator() -> Generator[str, None, None]:
return output[0] if len(output) == 1 else output


# The to_bijoy() function.
# Used to parse from Bengali in Unicode to Bijoy Keyboard format.
def to_bijoy(*texts: str) -> Union[str, List[str]]:
"""
#### Converts input text (Avro, Unicode) to Bijoy Keyboard format (ASCII).
If a valid conversion is found, then it returns the converted string.
Parameters:
- `*texts: str | Tuple[str]`: The text(s) to convert.
Usage:
```python
import avro
converted = avro.to_bijoy('আমার সোনার বাংলা')
print(converted)
```
"""

@lru_cache(maxsize=128)
def _convert_backend(text: str) -> str:
text = processor.rearrange_unicode_text(re.sub("ৌ", "ৌ", re.sub("ো", "ো", text)))

for unic in BIJOY_MAP:
text = re.sub(unic, BIJOY_MAP[unic], text)

return text.strip()

output = _concurrency_helper(_convert_backend, texts)
return output[0] if len(output) == 1 else output


# The to_unicode() function.
# Used to parse from Bijoy Keyboard format to Bengali in Unicode.
def to_unicode(*texts):
"""
#### Converts input text (Bijoy Keyboard, ASCII) to Unicode (Avro Keyboard format).
If a valid conversion is found, then it returns the converted string.
Parameters:
- `*texts: str | Tuple[str]`: The text(s) to convert.
Usage:
```python
import avro
converted = avro.to_unicode('Avwg evsjvh় Mvb MvB;')
print(converted)
```
"""

@lru_cache(maxsize=128)
def _convert_backend(text: str) -> str:
for ascii_c in BIJOY_MAP_REVERSE:
text = re.sub(re.escape(ascii_c), BIJOY_MAP_REVERSE[ascii_c], text)

text = re.sub("অা", "আ", processor.rearrange_bijoy_text(text))
return text.strip()

output = _concurrency_helper(_convert_backend, texts)
output = _concurrency_helper(_convert_backend_unicode, texts)
return output[0] if len(output) == 1 else output


# The reverse() function.
# Used to parse from Bengali in Unicode to English Roman script.
def reverse(*texts: str, remap_words: bool = True) -> Union[str, List[str]]:
"""
#### Reverses input text to Roman script typed in English.
If a valid replacement is found, then it returns the replaced string.
If no replacement is found, then it instead returns the input text.
Parameters:
- `*texts: str | Tuple[str]`: The text(s) to reverse.
- `remap_words: bool = True`: Whether to reverse input text with remapped (exception) words.
Usage:
```python
import avro
reversed = avro.reverse('আমার সোনার বাংলা')
print(reversed)
```
"""

# Internal function for multiple reverses.
@lru_cache(maxsize=128)
def _reverse_backend(text: str) -> str:
manual_required = True # Whether manual intervention is required.

# Replace predefined exceptions in the input text.
if remap_words:
text, manual_required = processor.find_in_remap(text, reversed=True)

# Iterate through input text.
def output_generator() -> Generator[str, None, None]:
for cur, i in enumerate(text):
try:
i.encode("utf-8")
match = processor.match_patterns(text, cur, rule=False, reversed=True)

yield (match["reversed"] or match["found"]) if match["matched"] else i

except UnicodeDecodeError:
yield i

return "".join(chain.from_iterable(output_generator())) if manual_required else text

# Extension for the _reverse_backend() function.
@lru_cache(maxsize=128)
def _reverse_backend_ext(text: str) -> str:
separated_texts = REVERSE_REGEX.split(text)
text_segments = [_reverse_backend(separated_text) for separated_text in separated_texts]
return "".join(text_segments)

# Prepare final output.
output = _concurrency_helper(_reverse_backend_ext, texts)
output = _concurrency_helper(lambda text: _reverse_backend_ext(text, remap_words), texts)
return output[0] if len(output) == 1 else output

0 comments on commit 6960255

Please sign in to comment.