Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for --wrap option #129

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 47 additions & 7 deletions bagit.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import signal
import sys
import tempfile
import textwrap
import unicodedata
import warnings
from collections import defaultdict
Expand Down Expand Up @@ -137,7 +138,8 @@ def find_locale_dir():


def make_bag(
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None,
encoding="utf-8", line_length=0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering whether line_length should be something like max_tag_line_length so it's a little more obvious what it applies to

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree line_length is a bit too general. In thinking what would make sense with an option named --wrap-tags (c.f. conversation above), I worry max_tag_line_length might be too much:

  --wrap-tags MAX_TAG_LINE_LENGTH
                        Limit line lengths in the tag file (bag-info.txt) by
                        wrapping long values and indenting subsequent lines
                        with a space character. (Default: don't.)

What about tag_wrap_column? That leads to:

  --wrap-tags TAG_WRAP_COLUMN
                        Limit line lengths in the tag file (bag-info.txt) by
                        wrapping long values and indenting subsequent lines
                        with a space character. (Default: don't.)

):
"""
Convert a given directory into a bag. You can pass in arbitrary
Expand Down Expand Up @@ -256,7 +258,7 @@ def make_bag(
)

bag_info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)
_make_tag_file("bag-info.txt", bag_info)
_make_tag_file("bag-info.txt", bag_info, line_length)

for c in checksums:
_make_tagmanifest_file(c, bag_dir, encoding="utf-8")
Expand Down Expand Up @@ -450,7 +452,7 @@ def payload_entries(self):
if key.startswith("data" + os.sep)
)

def save(self, processes=1, manifests=False):
def save(self, processes=1, manifests=False, line_length=0):
"""
save will persist any changes that have been made to the bag
metadata (self.info).
Expand All @@ -463,6 +465,11 @@ def save(self, processes=1, manifests=False):

If you want to control the number of processes that are used when
recalculating checksums use the processes parameter.

If you want long tag values to be wrapped by breaking long strings at
whitespace characters, set line_length to a value greater than 0. An
integer value greater than 0 causes line-wrapping to be performed on
a best-effort basis to limit line lengths to the given value.
"""
# Error checking
if not self.path:
Expand Down Expand Up @@ -514,7 +521,7 @@ def save(self, processes=1, manifests=False):
LOGGER.info(_("Updating Payload-Oxum in %s"), self.tag_file_name)
self.info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)

_make_tag_file(self.tag_file_name, self.info)
_make_tag_file(self.tag_file_name, self.info, line_length)

# Update tag-manifest for changes to manifest & bag-info files
for alg in self.algorithms:
Expand Down Expand Up @@ -1219,16 +1226,37 @@ def _parse_tags(tag_file):
yield (tag_name, tag_value.strip())


def _make_tag_file(bag_info_path, bag_info):
def _make_tag_file(bag_info_path, bag_info, line_length):
headers = sorted(bag_info.keys())
with open_text_file(bag_info_path, "w") as f:
for h in headers:
values = bag_info[h]
if not isinstance(values, list):
values = [values]
for txt in values:
# strip CR, LF and CRLF so they don't mess up the tag file
txt = re.sub(r"\n|\r|(\r\n)", "", force_unicode(txt))
txt = force_unicode(txt)
if line_length > 1:
# Account for colon & space written after the property name.
prop_width = len(h) + 2
first_break = prop_width + len(txt.split(None, 1)[0])
if line_length <= first_break:
# Start value on the next line.
txt = '\n'.join(textwrap.wrap(txt, width=line_length,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These two calls are almost identical but it takes time to confirm that. Do you think it would be worth assigning initial_indent first so we can have an identical textwrap call? I'm also wondering whether this block of code is now long enough that we should break it into a separate function so this would just be for txt in values: txt = wrap_tag(h, txt, line_length)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding the duplication: although initial_indent could be pulled out so that there's only one call to wrap, the problem is that there's a 2nd step after the call to wrap that depends on the same condition (see line 1257). So, the resulting alternative code would have a conditional for setting initial_indent, followed by the call to wrap, followed by an identical conditional to adjust the output of wrap. I find this alternative less readable than the current code (I tried it to see), although I realize this may be a matter of personal preference. Still, what about simply moving the parameter initial_indent higher up in the call to wrap, so that it's easier to spot the difference quickly?

Regarding splitting it out to a separate function: agreed. However, I might recommend naming the function differently, if you don't object, because in this case it wouldn't necessarily wrap the tag value. So perhaps something like _format_tag_value?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went ahead and implemented the changes discussed above in the latest commit.

break_long_words=False,
break_on_hyphens=False,
initial_indent='\n ',
subsequent_indent=' '))
else:
# Account for tag name by temporarily adding a leading
# space before calling wrap(), then removing the space.
txt = '\n'.join(textwrap.wrap(txt, width=line_length,
break_long_words=False,
break_on_hyphens=False,
initial_indent=' '*prop_width,
subsequent_indent=' '))
txt = txt[prop_width:]
else:
txt = re.sub(r"\n|\r|(\r\n)", "", txt)
f.write("%s: %s\n" % (h, txt))


Expand Down Expand Up @@ -1499,6 +1527,17 @@ def _make_parser():
" without performing checksum validation to detect corruption."
),
)
parser.add_argument(
"--wrap",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the idea but I was wondering whether “wrap” might be a little too generic a name for the option. What do you think about something like --wrap-tags or --max-tag-line-length?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I struggled with naming the option! The current result was the outcome of trying to satisfy two goals: (1) emulating the existing bagit.py style of using mostly single words (for those options other than the metadata fields), and (2) accounting for the fact that the argument parser displays the associated variable name as part of the help text. The latter means that one ends up having to think about the argument name and the variable jointly. The current combination leads to the following output for bagit.py -h:

--wrap LINE_LENGTH    Limit line lengths in the tag file (bag-info.txt) by
                      wrapping long values and indenting subsequent lines
                      with a space character. (Default: don't.)

I had hoped this was clear enough, but I have to agree that by itself, the current combination leaves ambiguous what is being wrapped. I'm fine with changing it. I lean towards --wrap-tags.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on --wrap-tags. I'd prefer clarity to single words and with that we don't to touch either dest or add a metavar option.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Just mentioning it in case it wasn't obvious: all these discussed changes are in the update to the PR made a few days ago.)

type=int,
dest="line_length",
default=0,
help=_(
"Limit line lengths in the tag file (bag-info.txt) by"
" wrapping long values and indenting subsequent lines with a"
" space character. (Default: don't.)"
),
)

checksum_args = parser.add_argument_group(
_("Checksum Algorithms"),
Expand Down Expand Up @@ -1596,6 +1635,7 @@ def main():
bag_info=args.bag_info,
processes=args.processes,
checksums=args.checksums,
line_length=args.line_length,
)
except Exception as exc:
LOGGER.error(
Expand Down
24 changes: 24 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,30 @@ def test_validate_fast(self):
os.remove(j(self.tmpdir, "data", "loc", "2478433644_2839c5e8b8_o_d.jpg"))
self.assertRaises(bagit.BagValidationError, self.validate, bag, fast=True)

def test_validate_wrapped_tag_lines(self):
info = {"External-Description":
('BagIt is a set of hierarchical file layout conventions'
' designed to support storage and transfer of arbitrary'
' digital content. A bag consists of a directory containing'
' the payload files and other accompanying metadata files'
' known as "tag" files.')}
bag = bagit.make_bag(self.tmpdir, bag_info=info, line_length=79)
self.assertEqual(self.validate(bag, fast=True), True)
os.remove(j(self.tmpdir, "data", "loc", "2478433644_2839c5e8b8_o_d.jpg"))
self.assertRaises(bagit.BagValidationError, self.validate, bag, fast=True)

def test_validate_wrapped_tag_lines_short(self):
info = {"External-Description":
('BagIt is a set of hierarchical file layout conventions'
' designed to support storage and transfer of arbitrary'
' digital content. A bag consists of a directory containing'
' the payload files and other accompanying metadata files'
' known as "tag" files.')}
bag = bagit.make_bag(self.tmpdir, bag_info=info, line_length=18)
self.assertEqual(self.validate(bag, fast=True), True)
os.remove(j(self.tmpdir, "data", "loc", "2478433644_2839c5e8b8_o_d.jpg"))
self.assertRaises(bagit.BagValidationError, self.validate, bag, fast=True)

def test_validate_completeness(self):
bag = bagit.make_bag(self.tmpdir)
old_path = j(self.tmpdir, "data", "README")
Expand Down