Improve wc and split CLI (#99)

Added flags and file-input functionality to split. Updated usage commands in the README. First progress on #97 and #98, but still a work in progress.
ashvardanian · Feb 21, 2024 · c878caf · c878caf
1 parent ec8abc2
commit c878caf
Show file tree

Hide file tree

Showing 3 changed files with 170 additions and 57 deletions.
diff --git a/cli/README.md b/cli/README.md
@@ -6,7 +6,7 @@ The `wc` utility on Linux can be used to count the number of lines, words, and b
 With StringZilla's SIMD-accelerated character and character-set search, the same counting can be noticeably faster, even on slow SSDs.
 
 ```bash
-$ time wc enwik9.txt 
+$ time wc enwik9.txt
   13147025  129348346 1000000000 enwik9.txt
 
 real    0m3.562s
@@ -33,7 +33,7 @@ real    0m6.424s
 user    0m0.179s
 sys     0m0.663s
 
-$ time cli/split.py 100000 enwik9.txt ...
+$ time cli/split.py -l 100000 enwik9.txt ...
 
 real    0m1.482s
 user    0m1.020s
@@ -45,4 +45,4 @@ sys     0m0.460s
 What other interfaces should be added?
 
 - Levenshtein distances?
-- Fuzzy search?
+- Fuzzy search?
diff --git a/cli/split.py b/cli/split.py
@@ -1,71 +1,104 @@
 #!/usr/bin/env python3
 
+import argparse
 import sys
-
+import stringzilla
 from stringzilla import File, Str
 
 
-def split_file(file_path, lines_per_file, output_prefix):
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Output pieces of FILE to PREFIXaa, PREFIXab, ...; default size is 1000 lines, and default PREFIX is 'x'."
+    )
+    parser.add_argument(
+        "file", nargs="?", default="-", help='File to process, "-" for standard input'
+    )
+    parser.add_argument(
+        "prefix", nargs="?", default="X", help='Output file prefix, default is "x"'
+    )
+    parser.add_argument(
+        "-l",
+        "--lines",
+        type=int,
+        default=1000,
+        help="Number of lines per output file, default is 1000",
+    )
+    parser.add_argument(
+        "-t",
+        "--separator",
+        default="\n",
+        help="Use SEP instead of newline as the record separator; '\\0' (zero) specifies the NUL character",
+    )
+    parser.add_argument(
+        "-n",
+        "--number",
+        type=int,
+        default=None,
+        help="Generate N output files based on size of input",
+    )
+    parser.add_argument("--version", action="version", version=stringzilla.__version__)
+    return parser.parse_args()
+
+
+def split_file(file_path, lines_per_file, output_prefix, separator, number_of_files):
     try:
-        # 1. Memory-map the large file
-        file_mapped = File(file_path)
-        file_contents = Str(file_mapped)
-
-        # Variables to keep track of the current position and file part number
+        if separator == "\\0":
+            separator = "\0"
+        if file_path == "-":
+            file_contents = Str(sys.stdin.read())
+        else:
+            file_mapped = File(file_path)
+            file_contents = Str(file_mapped)
+
+        if number_of_files is not None:
+            total_length = len(file_contents)
+            chunk_size = total_length // number_of_files
+            for file_part in range(number_of_files):
+                start = file_part * chunk_size
+                end = (
+                    start + chunk_size
+                    if file_part < number_of_files - 1
+                    else total_length
+                )
+                current_slice = file_contents[start:end]
+                output_path = f"{output_prefix}{file_part}"
+                current_slice.write_to(output_path)
+            return
         current_position = 0
         file_part = 0
-        newline_position = (
-            -1
-        )  # Start before file begins to find the first newline correctly
+        newline_position = -1
 
-        # Loop until the end of the file
         while current_position < len(file_contents):
-            # 2. Loop to skip `lines_per_file` lines
             for _ in range(lines_per_file):
-                newline_position = file_contents.find("\n", newline_position + 1)
-                if newline_position == -1:  # No more newlines
+                newline_position = file_contents.find(separator, newline_position + 1)
+                if newline_position == -1:
                     break
 
-            # If no newlines were found and we're not at the start, process the rest of the file
             if newline_position == -1 and current_position < len(file_contents):
                 newline_position = len(file_contents)
 
-            # 3. Use offset_within to get the length of the current section
-            # Assuming offset_within gives you the length from the current position
             section_length = (
                 newline_position - current_position if newline_position != -1 else 0
             )
 
-            # Extract the current section to write out
-            if section_length > 0:  # Prevent creating empty files
+            if section_length > 0:
                 current_slice = file_contents[current_position : newline_position + 1]
-
-                # 4. Save the current slice to file
                 output_path = f"{output_prefix}{file_part}"
                 current_slice.write_to(output_path)
 
-                # Prepare for the next slice
                 file_part += 1
                 current_position = newline_position + 1
 
     except FileNotFoundError:
         print(f"No such file: {file_path}")
     except Exception as e:
         print(f"An error occurred: {e}")
+        print("Usage example: split.py [-l LINES] [file] [prefix]")
 
 
 def main():
-    if len(sys.argv) < 4:
-        print(
-            "Usage: python split_file.py <lines_per_file> <input_file> <output_prefix>"
-        )
-        sys.exit(1)
-
-    lines_per_file = int(sys.argv[1])
-    file_path = sys.argv[2]
-    output_prefix = sys.argv[3]
-
-    split_file(file_path, lines_per_file, output_prefix)
+    args = parse_arguments()
+    split_file(args.file, args.lines, args.prefix, args.separator, args.number)
 
 
 if __name__ == "__main__":

diff --git a/cli/wc.py b/cli/wc.py
@@ -1,36 +1,116 @@
 #!/usr/bin/env python3
 
 import sys
-
+import argparse
+import stringzilla
 from stringzilla import File, Str
 
 
-def wc(file_path):
-    try:
-        mapped_file = File(file_path)
-        mapped_bytes = Str(mapped_file)
-        line_count = mapped_bytes.count("\n")
-        word_count = mapped_bytes.count(" ")
-        char_count = mapped_bytes.__len__()
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Print newline, word, and byte counts for each FILE, and a total line if more than one FILE is \
+        specified. A word is a non-zero-length sequence of acters delimited by white space."
+    )
+    parser.add_argument("files", nargs="*", default=["-"], help="Files to process")
+    parser.add_argument(
+        "-c", "--bytes", action="store_true", help="print the byte counts"
+    )
+    parser.add_argument(
+        "-m", "--chars", action="store_true", help="print the character counts"
+    )
+    parser.add_argument(
+        "-l", "--lines", action="store_true", help="print the newline counts"
+    )
+    parser.add_argument(
+        "-L",
+        "--max-line-length",
+        action="store_true",
+        help="print the maximum display width",
+    )
+    parser.add_argument(
+        "-w", "--words", action="store_true", help="print the word counts"
+    )
+    parser.add_argument("--version", action="version", version=stringzilla.__version__)
+    return parser.parse_args()
+
+
+def wc(file_path, args):
+    if file_path == "-":  # read from stdin
+        content = sys.stdin.read()
+        mapped_bytes = Str(content)
+    else:
+        try:
+            mapped_file = File(file_path)
+            mapped_bytes = Str(mapped_file)
+        except RuntimeError:  # File gives a RuntimeError if the file does not exist
+            return f"No such file: {file_path}", False
+
+    line_count = mapped_bytes.count("\n")
+    word_count = mapped_bytes.count(" ") + 1
+    char_count = mapped_bytes.__len__()
+    counts = {
+        "line_count": line_count,
+        "word_count": word_count,
+        "char_count": char_count,
+    }
+
+    if args.max_line_length:
+        max_line_length = max(len(line) for line in str(mapped_bytes).split("\n"))
+        counts["max_line_length"] = max_line_length
 
-        return line_count, word_count, char_count
-    except FileNotFoundError:
-        return f"No such file: {file_path}"
+    if args.bytes or args.chars:
+        byte_count = char_count  # assume 1 char = 1 byte
+        counts["byte_count"] = byte_count
+
+    return counts, True
+
+
+def format_output(counts, args):
+    selected_counts = []
+    if args.lines:
+        selected_counts.append(counts["line_count"])
+    if args.words:
+        selected_counts.append(counts["word_count"])
+    if args.chars:
+        selected_counts.append(counts["char_count"])
+    if args.bytes:
+        selected_counts.append(counts.get("byte_count", counts["char_count"]))
+    if args.max_line_length:
+        selected_counts.append(counts.get("max_line_length", 0))
+
+    if not any([args.lines, args.words, args.chars, args.bytes, args.max_line_length]):
+        selected_counts = [
+            counts["line_count"],
+            counts["word_count"],
+            counts["char_count"],
+        ]
+
+    return " ".join(str(count) for count in selected_counts)
 
 
 def main():
-    if len(sys.argv) < 2:
-        print("Usage: python wc.py <file>")
-        sys.exit(1)
+    args = parse_arguments()
+    total_counts = {
+        "line_count": 0,
+        "word_count": 0,
+        "char_count": 0,
+        "max_line_length": 0,
+        "byte_count": 0,
+    }
 
-    file_path = sys.argv[1]
-    counts = wc(file_path)
+    for file_path in args.files:
+        counts, success = wc(file_path, args)
+        if success:
+            for key in total_counts.keys():
+                total_counts[key] += counts.get(key, 0)
+            output = format_output(counts, args) + f" {file_path}"
+            print(output)
+        else:
+            print(counts)
 
-    if isinstance(counts, tuple):
-        line_count, word_count, char_count = counts
-        print(f"{line_count} {word_count} {char_count} {file_path}")
-    else:
-        print(counts)
+    if len(args.files) > 1:
+        total_output = format_output(total_counts, args) + " total"
+        print(total_output)
 
 
 if __name__ == "__main__":