Optimized regex, separated backend functions, and generally made the code more readable
hitblast · Aug 11, 2024 · 6960255 · 6960255
1 parent 172e98f
commit 6960255
Show file tree

Hide file tree

Showing 2 changed files with 101 additions and 163 deletions.
diff --git a/avro/__init__.py b/avro/__init__.py
@@ -11,5 +11,5 @@
 from .main import *
 
 # Version information.
-__version_info__ = (2024, 8, 8)
+__version_info__ = (2024, 8, 11)
 __version__ = ".".join(map(str, __version_info__))
diff --git a/avro/main.py b/avro/main.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: MIT
 
-
 # Import first-party Python libraries.
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -13,8 +12,8 @@
 from .utils.config import BIJOY_MAP, BIJOY_MAP_REVERSE
 
 
-# The helper function for handling multithreaded workloads.
-def _concurrency_helper(func: Callable, params: Tuple[str]) -> List[str]:
+# Concurrency helper function for handling multithreaded workloads.
+def _concurrency_helper(func: Callable, params: Tuple[str, ...]) -> List[str]:
     output = []
 
     with ThreadPoolExecutor() as executor:
@@ -33,78 +32,112 @@ def _concurrency_helper(func: Callable, params: Tuple[str]) -> List[str]:
 REVERSE_REGEX = re.compile(r"(\s|\.|,|\?|।|-|;|')", re.UNICODE)
 
 
-# The primary parse function for the library.
-def parse(*texts: str, bijoy: bool = False, remap_words: bool = True) -> Union[str, List[str]]:
-    """
-    #### Parses input text, matches and replaces using the Avro Dictionary.
+# Backend Functions.
+# The output generator for the parse function.
+def _parse_output_generator(fixed_text: str, cur_end: int) -> Generator[str, None, None]:
+    for cur, i in enumerate(fixed_text):
+        uni_pass = UTF8_REGEX.match(i) is not None
+        if not uni_pass:
+            cur_end = cur + 1
+            yield i
+        elif cur >= cur_end and uni_pass:
+            match = processor.match_patterns(fixed_text, cur, rule=False)
+            matched = match["matched"]
+            if matched:
+                yield match["replaced"]
+                cur_end = cur + len(match["found"])
+            else:
+                match = processor.match_patterns(fixed_text, cur, rule=True)
+                matched = match["matched"]
+                if matched:
+                    cur_end = cur + len(match["found"])
+                    replaced = processor.process_rules(
+                        rules=match["rules"], fixed_text=fixed_text, cur=cur, cur_end=cur_end
+                    )
+                    if replaced:
+                        yield replaced
+                    else:
+                        yield match["replaced"]
+            if not matched:
+                cur_end = cur + 1
+                yield i
 
-    If a valid replacement is found, then it returns the replaced string.
-    If no replacement is found, then it instead returns the input text.
 
-    Parameters:
-    - `*texts: str | Tuple[str]`: The text(s) to parse.
-    - `bijoy: bool = False`: Whether to return in the Bijoy Keyboard format (ASCII).
-    - `remap_words: bool = True`: Whether to parse input text with remapped (exception) words.
+# The working backend for the parse() function.
+@lru_cache(maxsize=128)
+def _parse_backend(text: str, remap_words: bool) -> str:
+    fixed_text = validate.fix_string_case(text)  # Sanitize input text.
+    manual_required = True  # Whether manual intervention is required.
+    cur_end = 0  # Cursor end point.
 
-    Usage:
-    ```python
-    import avro
+    # Replace predefined exceptions in the input text.
+    if remap_words:
+        fixed_text, manual_required = processor.find_in_remap(fixed_text)
 
-    parsed = avro.parse('ami banglay gan gai')
-    print(parsed)
-    ```
-    """
+    return (
+        "".join(chain.from_iterable(_parse_output_generator(fixed_text, cur_end)))
+        if manual_required
+        else fixed_text
+    )
 
-    @lru_cache(maxsize=128)
-    def _parse_backend(text: str) -> str:
-        fixed_text = validate.fix_string_case(text)  # Sanitize input text.
-        manual_required = True  # Whether manual intervention is required.
-        cur_end = 0  # Cursor end point.
 
-        # Replace predefined exceptions in the input text.
-        if remap_words:
-            fixed_text, manual_required = processor.find_in_remap(fixed_text)
+# The working backend for the to_bijoy() function.
+@lru_cache(maxsize=128)
+def _convert_backend(text: str) -> str:
+    text = processor.rearrange_unicode_text(re.sub("ৌ", "ৌ", re.sub("ো", "ো", text)))
 
-        def output_generator() -> Generator[str, None, None]:
-            nonlocal cur_end
+    for unic in BIJOY_MAP:
+        text = re.sub(unic, BIJOY_MAP[unic], text)
 
-            # Iterate through input text.
-            for cur, i in enumerate(fixed_text):
-                uni_pass = UTF8_REGEX.match(i) is not None
+    return text.strip()
 
-                if not uni_pass:
-                    cur_end = cur + 1
-                    yield i
 
-                elif cur >= cur_end and uni_pass:
-                    match = processor.match_patterns(fixed_text, cur, rule=False)
-                    matched = match["matched"]
+# The working backend for the to_unicode() function.
+@lru_cache(maxsize=128)
+def _convert_backend_unicode(text: str) -> str:
+    for ascii_c in BIJOY_MAP_REVERSE:
+        text = re.sub(re.escape(ascii_c), BIJOY_MAP_REVERSE[ascii_c], text)
 
-                    if matched:
-                        yield match["replaced"]
-                        cur_end = cur + len(match["found"])
-                    else:
-                        match = processor.match_patterns(fixed_text, cur, rule=True)
-                        matched = match["matched"]
+    text = re.sub("অা", "আ", processor.rearrange_bijoy_text(text))
+    return text.strip()
+
+
+# The output generator for the reverse function.
+def _reverse_output_generator(text: str) -> Generator[str, None, None]:
+    for cur, i in enumerate(text):
+        try:
+            i.encode("utf-8")
+            match = processor.match_patterns(text, cur, rule=False, reversed=True)
+            yield (match["reversed"] or match["found"]) if match["matched"] else i
+        except UnicodeDecodeError:
+            yield i
+
+
+# The working backend for the reverse() function.
+@lru_cache(maxsize=128)
+def _reverse_backend(text: str, remap_words: bool) -> str:
+    manual_required = True  # Whether manual intervention is required.
 
-                        if matched:
-                            cur_end = cur + len(match["found"])
-                            replaced = processor.process_rules(
-                                rules=match["rules"], fixed_text=fixed_text, cur=cur, cur_end=cur_end
-                            )
+    # Replace predefined exceptions in the input text.
+    if remap_words:
+        text, manual_required = processor.find_in_remap(text, reversed=True)
 
-                            if replaced:
-                                yield replaced
-                            else:
-                                yield match["replaced"]
+    return "".join(chain.from_iterable(_reverse_output_generator(text))) if manual_required else text
 
-                    if not matched:
-                        cur_end = cur + 1
-                        yield i
 
-        return "".join(chain.from_iterable(output_generator())) if manual_required else fixed_text
+# Backend extension for the reverse() function.
+@lru_cache(maxsize=128)
+def _reverse_backend_ext(text: str, remap_words: bool) -> str:
+    separated_texts = REVERSE_REGEX.split(text)
+    text_segments = [_reverse_backend(separated_text, remap_words) for separated_text in separated_texts]
+    return "".join(text_segments)
 
-    output = _concurrency_helper(_parse_backend, texts)
+
+# Primary user-end functions.
+# The parse() function.
+# Used to parse from English Roman script to Bengali in Unicode.
+def parse(*texts: str, bijoy: bool = False, remap_words: bool = True) -> Union[str, List[str]]:
+    output = _concurrency_helper(lambda text: _parse_backend(text, remap_words), texts)
 
     # If the `bijoy` parameter is set to `True`, then convert the output to Bijoy Keyboard format.
     if bijoy:
@@ -113,117 +146,22 @@ def output_generator() -> Generator[str, None, None]:
         return output[0] if len(output) == 1 else output
 
 
+# The to_bijoy() function.
+# Used to convert Bengali Unicode text to the Bijoy Keyboard format.
 def to_bijoy(*texts: str) -> Union[str, List[str]]:
-    """
-    #### Converts input text (Avro, Unicode) to Bijoy Keyboard format (ASCII).
-
-    If a valid conversion is found, then it returns the converted string.
-
-    Parameters:
-    - `*texts: str | Tuple[str]`: The text(s) to convert.
-
-    Usage:
-    ```python
-    import avro
-
-    converted = avro.to_bijoy('আমার সোনার বাংলা')
-    print(converted)
-    ```
-    """
-
-    @lru_cache(maxsize=128)
-    def _convert_backend(text: str) -> str:
-        text = processor.rearrange_unicode_text(re.sub("ৌ", "ৌ", re.sub("ো", "ো", text)))
-
-        for unic in BIJOY_MAP:
-            text = re.sub(unic, BIJOY_MAP[unic], text)
-
-        return text.strip()
-
     output = _concurrency_helper(_convert_backend, texts)
     return output[0] if len(output) == 1 else output
 
 
+# The to_unicode() function.
+# Used to convert Bijoy Keyboard format text to Bengali in Unicode.
 def to_unicode(*texts):
-    """
-    #### Converts input text (Bijoy Keyboard, ASCII) to Unicode (Avro Keyboard format).
-
-    If a valid conversion is found, then it returns the converted string.
-
-    Parameters:
-    - `*texts: str | Tuple[str]`: The text(s) to convert.
-
-    Usage:
-    ```python
-    import avro
-
-    converted = avro.to_unicode('Avwg evsjvh় Mvb MvB;')
-    print(converted)
-    ```
-    """
-
-    @lru_cache(maxsize=128)
-    def _convert_backend(text: str) -> str:
-        for ascii_c in BIJOY_MAP_REVERSE:
-            text = re.sub(re.escape(ascii_c), BIJOY_MAP_REVERSE[ascii_c], text)
-
-        text = re.sub("অা", "আ", processor.rearrange_bijoy_text(text))
-        return text.strip()
-
-    output = _concurrency_helper(_convert_backend, texts)
+    output = _concurrency_helper(_convert_backend_unicode, texts)
     return output[0] if len(output) == 1 else output
 
 
+# The reverse() function.
+# Used to parse from Bengali in Unicode to English Roman script.
 def reverse(*texts: str, remap_words: bool = True) -> Union[str, List[str]]:
-    """
-    #### Reverses input text to Roman script typed in English.
-
-    If a valid replacement is found, then it returns the replaced string.
-    If no replacement is found, then it instead returns the input text.
-
-    Parameters:
-    - `*texts: str | Tuple[str]`: The text(s) to reverse.
-    - `remap_words: bool = True`: Whether to reverse input text with remapped (exception) words.
-
-    Usage:
-    ```python
-    import avro
-
-    reversed = avro.reverse('আমার সোনার বাংলা')
-    print(reversed)
-    ```
-    """
-
-    # Internal function for multiple reverses.
-    @lru_cache(maxsize=128)
-    def _reverse_backend(text: str) -> str:
-        manual_required = True  # Whether manual intervention is required.
-
-        # Replace predefined exceptions in the input text.
-        if remap_words:
-            text, manual_required = processor.find_in_remap(text, reversed=True)
-
-        # Iterate through input text.
-        def output_generator() -> Generator[str, None, None]:
-            for cur, i in enumerate(text):
-                try:
-                    i.encode("utf-8")
-                    match = processor.match_patterns(text, cur, rule=False, reversed=True)
-
-                    yield (match["reversed"] or match["found"]) if match["matched"] else i
-
-                except UnicodeDecodeError:
-                    yield i
-
-        return "".join(chain.from_iterable(output_generator())) if manual_required else text
-
-    # Extension for the _reverse_backend() function.
-    @lru_cache(maxsize=128)
-    def _reverse_backend_ext(text: str) -> str:
-        separated_texts = REVERSE_REGEX.split(text)
-        text_segments = [_reverse_backend(separated_text) for separated_text in separated_texts]
-        return "".join(text_segments)
-
-    # Prepare final output.
-    output = _concurrency_helper(_reverse_backend_ext, texts)
+    output = _concurrency_helper(lambda text: _reverse_backend_ext(text, remap_words), texts)
     return output[0] if len(output) == 1 else output