Skip to content

Commit

Permalink
Improve wc and split CLI (#99)
Browse files Browse the repository at this point in the history
Added flags and file input functionalities to split.
Updated usage commands in the README.
First progress on #97 and #98, but still WIP.
  • Loading branch information
ghazariann authored Feb 21, 2024
1 parent ec8abc2 commit c878caf
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 57 deletions.
6 changes: 3 additions & 3 deletions cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The `wc` utility on Linux can be used to count the number of lines, words, and b
Using SIMD-accelerated character and character-set search, StringZilla can be noticeably faster, even with slow SSDs.

```bash
$ time wc enwik9.txt
$ time wc enwik9.txt
13147025 129348346 1000000000 enwik9.txt

real 0m3.562s
Expand All @@ -33,7 +33,7 @@ real 0m6.424s
user 0m0.179s
sys 0m0.663s

$ time cli/split.py 100000 enwik9.txt ...
$ time cli/split.py -l 100000 enwik9.txt ...

real 0m1.482s
user 0m1.020s
Expand All @@ -45,4 +45,4 @@ sys 0m0.460s
What other interfaces should be added?

- Levenshtein distances?
- Fuzzy search?
- Fuzzy search?
99 changes: 66 additions & 33 deletions cli/split.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,104 @@
#!/usr/bin/env python3

import argparse
import sys

import stringzilla
from stringzilla import File, Str


def split_file(file_path, lines_per_file, output_prefix):
def parse_arguments():
    """Parse the command-line options of the `split` utility.

    Positional arguments are the input file ("-" selects standard input) and
    the output file prefix. The `-l/--lines`, `-t/--separator` and
    `-n/--number` options control how the input is cut into pieces.

    Returns:
        argparse.Namespace: parsed options exposing the `file`, `prefix`,
        `lines`, `separator` and `number` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Output pieces of FILE to PREFIXaa, PREFIXab, ...; default size is 1000 lines, and default PREFIX is 'x'."
    )
    parser.add_argument(
        "file", nargs="?", default="-", help='File to process, "-" for standard input'
    )
    # The default must be the lowercase "x" promised by both the help text
    # here and the parser description above.
    parser.add_argument(
        "prefix", nargs="?", default="x", help='Output file prefix, default is "x"'
    )
    parser.add_argument(
        "-l",
        "--lines",
        type=int,
        default=1000,
        help="Number of lines per output file, default is 1000",
    )
    parser.add_argument(
        "-t",
        "--separator",
        default="\n",
        help="Use SEP instead of newline as the record separator; '\\0' (zero) specifies the NUL character",
    )
    parser.add_argument(
        "-n",
        "--number",
        type=int,
        default=None,
        help="Generate N output files based on size of input",
    )
    parser.add_argument("--version", action="version", version=stringzilla.__version__)
    return parser.parse_args()


def split_file(file_path, lines_per_file, output_prefix, separator, number_of_files):
try:
# 1. Memory-map the large file
file_mapped = File(file_path)
file_contents = Str(file_mapped)

# Variables to keep track of the current position and file part number
if separator == "\\0":
separator = "\0"
if file_path == "-":
file_contents = Str(sys.stdin.read())
else:
file_mapped = File(file_path)
file_contents = Str(file_mapped)

if number_of_files is not None:
total_length = len(file_contents)
chunk_size = total_length // number_of_files
for file_part in range(number_of_files):
start = file_part * chunk_size
end = (
start + chunk_size
if file_part < number_of_files - 1
else total_length
)
current_slice = file_contents[start:end]
output_path = f"{output_prefix}{file_part}"
current_slice.write_to(output_path)
return
current_position = 0
file_part = 0
newline_position = (
-1
) # Start before file begins to find the first newline correctly
newline_position = -1

# Loop until the end of the file
while current_position < len(file_contents):
# 2. Loop to skip `lines_per_file` lines
for _ in range(lines_per_file):
newline_position = file_contents.find("\n", newline_position + 1)
if newline_position == -1: # No more newlines
newline_position = file_contents.find(separator, newline_position + 1)
if newline_position == -1:
break

# If no newlines were found and we're not at the start, process the rest of the file
if newline_position == -1 and current_position < len(file_contents):
newline_position = len(file_contents)

# 3. Use offset_within to get the length of the current section
# Assuming offset_within gives you the length from the current position
section_length = (
newline_position - current_position if newline_position != -1 else 0
)

# Extract the current section to write out
if section_length > 0: # Prevent creating empty files
if section_length > 0:
current_slice = file_contents[current_position : newline_position + 1]

# 4. Save the current slice to file
output_path = f"{output_prefix}{file_part}"
current_slice.write_to(output_path)

# Prepare for the next slice
file_part += 1
current_position = newline_position + 1

except FileNotFoundError:
print(f"No such file: {file_path}")
except Exception as e:
print(f"An error occurred: {e}")
print("Usage example: split.py [-l LINES] [file] [prefix]")


def main():
if len(sys.argv) < 4:
print(
"Usage: python split_file.py <lines_per_file> <input_file> <output_prefix>"
)
sys.exit(1)

lines_per_file = int(sys.argv[1])
file_path = sys.argv[2]
output_prefix = sys.argv[3]

split_file(file_path, lines_per_file, output_prefix)
args = parse_arguments()
split_file(args.file, args.lines, args.prefix, args.separator, args.number)


if __name__ == "__main__":
Expand Down
122 changes: 101 additions & 21 deletions cli/wc.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,116 @@
#!/usr/bin/env python3

import sys

import argparse
import stringzilla
from stringzilla import File, Str


def wc(file_path):
try:
mapped_file = File(file_path)
mapped_bytes = Str(mapped_file)
line_count = mapped_bytes.count("\n")
word_count = mapped_bytes.count(" ")
char_count = mapped_bytes.__len__()
def parse_arguments():
    """Parse the command-line options of the `wc` utility.

    Accepts zero or more input files (defaulting to "-", which the caller
    treats as standard input) plus boolean switches selecting which counts
    to print.

    Returns:
        argparse.Namespace: parsed options exposing `files`, `bytes`, `chars`,
        `lines`, `max_line_length` and `words`.
    """
    parser = argparse.ArgumentParser(
        # Adjacent literals are concatenated, so no source-file indentation
        # can leak into the help text the way a backslash continuation allows.
        description=(
            "Print newline, word, and byte counts for each FILE, and a total line if more than one FILE is "
            "specified. A word is a non-zero-length sequence of characters delimited by white space."
        )
    )
    parser.add_argument("files", nargs="*", default=["-"], help="Files to process")
    parser.add_argument(
        "-c", "--bytes", action="store_true", help="print the byte counts"
    )
    parser.add_argument(
        "-m", "--chars", action="store_true", help="print the character counts"
    )
    parser.add_argument(
        "-l", "--lines", action="store_true", help="print the newline counts"
    )
    parser.add_argument(
        "-L",
        "--max-line-length",
        action="store_true",
        help="print the maximum display width",
    )
    parser.add_argument(
        "-w", "--words", action="store_true", help="print the word counts"
    )
    parser.add_argument("--version", action="version", version=stringzilla.__version__)
    return parser.parse_args()


def wc(file_path, args):
if file_path == "-": # read from stdin
content = sys.stdin.read()
mapped_bytes = Str(content)
else:
try:
mapped_file = File(file_path)
mapped_bytes = Str(mapped_file)
except RuntimeError: # File gives a RuntimeError if the file does not exist
return f"No such file: {file_path}", False

line_count = mapped_bytes.count("\n")
word_count = mapped_bytes.count(" ") + 1
char_count = mapped_bytes.__len__()
counts = {
"line_count": line_count,
"word_count": word_count,
"char_count": char_count,
}

if args.max_line_length:
max_line_length = max(len(line) for line in str(mapped_bytes).split("\n"))
counts["max_line_length"] = max_line_length

return line_count, word_count, char_count
except FileNotFoundError:
return f"No such file: {file_path}"
if args.bytes or args.chars:
byte_count = char_count # assume 1 char = 1 byte
counts["byte_count"] = byte_count

return counts, True


def format_output(counts, args):
    """Render the requested counts as one space-separated string.

    Columns appear in a fixed order: lines, words, chars, bytes, maximum line
    length. Only the columns whose flag is set on `args` are emitted; when no
    flag is set, the line, word and char counts are printed.

    Args:
        counts: mapping holding "line_count", "word_count" and "char_count",
            and optionally "byte_count" and "max_line_length".
        args: object with boolean `lines`, `words`, `chars`, `bytes` and
            `max_line_length` attributes.

    Returns:
        str: the selected counts joined by single spaces.
    """
    # Each entry pairs a flag with a lazy getter, so a count is only looked
    # up when its column was actually requested.
    column_table = (
        (args.lines, lambda: counts["line_count"]),
        (args.words, lambda: counts["word_count"]),
        (args.chars, lambda: counts["char_count"]),
        (args.bytes, lambda: counts.get("byte_count", counts["char_count"])),
        (args.max_line_length, lambda: counts.get("max_line_length", 0)),
    )
    columns = [fetch() for enabled, fetch in column_table if enabled]

    # No flag given: fall back to the classic three-column output.
    if not columns:
        columns = [counts[key] for key in ("line_count", "word_count", "char_count")]

    return " ".join(map(str, columns))


def main():
if len(sys.argv) < 2:
print("Usage: python wc.py <file>")
sys.exit(1)
args = parse_arguments()
total_counts = {
"line_count": 0,
"word_count": 0,
"char_count": 0,
"max_line_length": 0,
"byte_count": 0,
}

file_path = sys.argv[1]
counts = wc(file_path)
for file_path in args.files:
counts, success = wc(file_path, args)
if success:
for key in total_counts.keys():
total_counts[key] += counts.get(key, 0)
output = format_output(counts, args) + f" {file_path}"
print(output)
else:
print(counts)

if isinstance(counts, tuple):
line_count, word_count, char_count = counts
print(f"{line_count} {word_count} {char_count} {file_path}")
else:
print(counts)
if len(args.files) > 1:
total_output = format_output(total_counts, args) + " total"
print(total_output)


if __name__ == "__main__":
Expand Down

0 comments on commit c878caf

Please sign in to comment.