Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Standard-compliant ws and split implementation (Issue 97 98) #99

Merged
merged 2 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The `wc` utility on Linux can be used to count the number of lines, words, and b
Using SIMD-accelerated character and character-set search, StringZilla, even with slow SSDs, it can be noticeably faster.

```bash
$ time wc enwik9.txt
$ time wc enwik9.txt
13147025 129348346 1000000000 enwik9.txt

real 0m3.562s
Expand All @@ -33,7 +33,7 @@ real 0m6.424s
user 0m0.179s
sys 0m0.663s

$ time cli/split.py 100000 enwik9.txt ...
$ time cli/split.py -l 100000 enwik9.txt ...

real 0m1.482s
user 0m1.020s
Expand All @@ -45,4 +45,4 @@ sys 0m0.460s
What other interfaces should be added?

- Levenshtein distances?
- Fuzzy search?
- Fuzzy search?
99 changes: 66 additions & 33 deletions cli/split.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,104 @@
#!/usr/bin/env python3

import argparse
import sys

import stringzilla
from stringzilla import File, Str


def split_file(file_path, lines_per_file, output_prefix):
def parse_arguments():
parser = argparse.ArgumentParser(
description="Output pieces of FILE to PREFIXaa, PREFIXab, ...; default size is 1000 lines, and default PREFIX is 'x'."
)
parser.add_argument(
"file", nargs="?", default="-", help='File to process, "-" for standard input'
)
parser.add_argument(
"prefix", nargs="?", default="X", help='Output file prefix, default is "x"'
)
parser.add_argument(
"-l",
"--lines",
type=int,
default=1000,
help="Number of lines per output file, default is 1000",
)
parser.add_argument(
"-t",
"--separator",
default="\n",
help="Use SEP instead of newline as the record separator; '\\0' (zero) specifies the NUL character",
)
parser.add_argument(
"-n",
"--number",
type=int,
default=None,
help="Generate N output files based on size of input",
)
parser.add_argument("--version", action="version", version=stringzilla.__version__)
return parser.parse_args()


def split_file(file_path, lines_per_file, output_prefix, separator, number_of_files):
try:
# 1. Memory-map the large file
file_mapped = File(file_path)
file_contents = Str(file_mapped)

# Variables to keep track of the current position and file part number
if separator == "\\0":
separator = "\0"
if file_path == "-":
file_contents = Str(sys.stdin.read())
else:
file_mapped = File(file_path)
file_contents = Str(file_mapped)

if number_of_files is not None:
total_length = len(file_contents)
chunk_size = total_length // number_of_files
for file_part in range(number_of_files):
start = file_part * chunk_size
end = (
start + chunk_size
if file_part < number_of_files - 1
else total_length
)
current_slice = file_contents[start:end]
output_path = f"{output_prefix}{file_part}"
current_slice.write_to(output_path)
return
current_position = 0
file_part = 0
newline_position = (
-1
) # Start before file begins to find the first newline correctly
newline_position = -1

# Loop until the end of the file
while current_position < len(file_contents):
# 2. Loop to skip `lines_per_file` lines
for _ in range(lines_per_file):
newline_position = file_contents.find("\n", newline_position + 1)
if newline_position == -1: # No more newlines
newline_position = file_contents.find(separator, newline_position + 1)
if newline_position == -1:
break

# If no newlines were found and we're not at the start, process the rest of the file
if newline_position == -1 and current_position < len(file_contents):
newline_position = len(file_contents)

# 3. Use offset_within to get the length of the current section
# Assuming offset_within gives you the length from the current position
section_length = (
newline_position - current_position if newline_position != -1 else 0
)

# Extract the current section to write out
if section_length > 0: # Prevent creating empty files
if section_length > 0:
current_slice = file_contents[current_position : newline_position + 1]

# 4. Save the current slice to file
output_path = f"{output_prefix}{file_part}"
current_slice.write_to(output_path)

# Prepare for the next slice
file_part += 1
current_position = newline_position + 1

except FileNotFoundError:
print(f"No such file: {file_path}")
except Exception as e:
print(f"An error occurred: {e}")
print("Usage example: split.py [-l LINES] [file] [prefix]")


def main():
if len(sys.argv) < 4:
print(
"Usage: python split_file.py <lines_per_file> <input_file> <output_prefix>"
)
sys.exit(1)

lines_per_file = int(sys.argv[1])
file_path = sys.argv[2]
output_prefix = sys.argv[3]

split_file(file_path, lines_per_file, output_prefix)
args = parse_arguments()
split_file(args.file, args.lines, args.prefix, args.separator, args.number)


if __name__ == "__main__":
Expand Down
122 changes: 101 additions & 21 deletions cli/wc.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,116 @@
#!/usr/bin/env python3

import sys

import argparse
import stringzilla
from stringzilla import File, Str


def wc(file_path):
try:
mapped_file = File(file_path)
mapped_bytes = Str(mapped_file)
line_count = mapped_bytes.count("\n")
word_count = mapped_bytes.count(" ")
char_count = mapped_bytes.__len__()
def parse_arguments():
parser = argparse.ArgumentParser(
description="Print newline, word, and byte counts for each FILE, and a total line if more than one FILE is \
specified. A word is a non-zero-length sequence of acters delimited by white space."
)
parser.add_argument("files", nargs="*", default=["-"], help="Files to process")
parser.add_argument(
"-c", "--bytes", action="store_true", help="print the byte counts"
)
parser.add_argument(
"-m", "--chars", action="store_true", help="print the character counts"
)
parser.add_argument(
"-l", "--lines", action="store_true", help="print the newline counts"
)
parser.add_argument(
"-L",
"--max-line-length",
action="store_true",
help="print the maximum display width",
)
parser.add_argument(
"-w", "--words", action="store_true", help="print the word counts"
)
parser.add_argument("--version", action="version", version=stringzilla.__version__)
return parser.parse_args()


def wc(file_path, args):
if file_path == "-": # read from stdin
content = sys.stdin.read()
mapped_bytes = Str(content)
else:
try:
mapped_file = File(file_path)
mapped_bytes = Str(mapped_file)
except RuntimeError: # File gives a RuntimeError if the file does not exist
return f"No such file: {file_path}", False

line_count = mapped_bytes.count("\n")
word_count = mapped_bytes.count(" ") + 1
char_count = mapped_bytes.__len__()
counts = {
"line_count": line_count,
"word_count": word_count,
"char_count": char_count,
}

if args.max_line_length:
max_line_length = max(len(line) for line in str(mapped_bytes).split("\n"))
counts["max_line_length"] = max_line_length

return line_count, word_count, char_count
except FileNotFoundError:
return f"No such file: {file_path}"
if args.bytes or args.chars:
byte_count = char_count # assume 1 char = 1 byte
counts["byte_count"] = byte_count

return counts, True


def format_output(counts, args):
selected_counts = []
if args.lines:
selected_counts.append(counts["line_count"])
if args.words:
selected_counts.append(counts["word_count"])
if args.chars:
selected_counts.append(counts["char_count"])
if args.bytes:
selected_counts.append(counts.get("byte_count", counts["char_count"]))
if args.max_line_length:
selected_counts.append(counts.get("max_line_length", 0))

if not any([args.lines, args.words, args.chars, args.bytes, args.max_line_length]):
selected_counts = [
counts["line_count"],
counts["word_count"],
counts["char_count"],
]

return " ".join(str(count) for count in selected_counts)


def main():
if len(sys.argv) < 2:
print("Usage: python wc.py <file>")
sys.exit(1)
args = parse_arguments()
total_counts = {
"line_count": 0,
"word_count": 0,
"char_count": 0,
"max_line_length": 0,
"byte_count": 0,
}

file_path = sys.argv[1]
counts = wc(file_path)
for file_path in args.files:
counts, success = wc(file_path, args)
if success:
for key in total_counts.keys():
total_counts[key] += counts.get(key, 0)
output = format_output(counts, args) + f" {file_path}"
print(output)
else:
print(counts)

if isinstance(counts, tuple):
line_count, word_count, char_count = counts
print(f"{line_count} {word_count} {char_count} {file_path}")
else:
print(counts)
if len(args.files) > 1:
total_output = format_output(total_counts, args) + " total"
print(total_output)


if __name__ == "__main__":
Expand Down
Loading