diff --git a/isbnfrombwbbot/README.md b/isbnfrombwbbot/README.md
new file mode 100644
index 00000000..f4e47696
--- /dev/null
+++ b/isbnfrombwbbot/README.md
@@ -0,0 +1,13 @@
+A set of scripts to add `isbn_13` values to editions whose BWB source record contains an ISBN 13.
+### How To Use
+```bash
+# Find editions with a BWB ISBN source record but no isbn_13 (editions with an isbn_10 are skipped too)
+./find_editions_with_isbn_bwb_not_13.sh /path/to/ol_dump.txt.gz /path/to/filtered_dump.txt.gz
+# Add ISBN 13s converted from the bwb source
+python isbn_bwb_to_13.py --file=/path/to/filtered_dump.txt.gz --dry_run=True --limit=1
+```
+If `dry_run` is True, the script will run as normal, but no changes will be saved to OpenLibrary.
+This is for debugging purposes. By default, `dry_run` is `True`.
+`limit` is the maximum number of changes to OpenLibrary that will occur before the script quits.
+By default, `limit` is set to `1`. Setting `limit` to `0` allows unlimited edits.
+A log is automatically generated whenever `isbn_bwb_to_13.py` executes.
diff --git a/isbnfrombwbbot/find_editions_with_isbn_bwb_not_13.sh b/isbnfrombwbbot/find_editions_with_isbn_bwb_not_13.sh
new file mode 100755
index 00000000..15d74083
--- /dev/null
+++ b/isbnfrombwbbot/find_editions_with_isbn_bwb_not_13.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Filter an Open Library dump down to editions that have a source record made of
+# "bwb:" plus 13 digits, but neither an isbn_13 nor an isbn_10 field.
+# Usage: find_editions_with_isbn_bwb_not_13.sh OL_DUMP.txt.gz OUTPUT.txt.gz
+
+if [[ -z $1 ]]
+  then
+    echo "No dump file provided"
+    exit 1
+fi
+if [[ -z $2 ]]
+  then
+    echo "No output file provided"
+    exit 1
+fi
+
+OL_DUMP=$1
+OUTPUT=$2
+
+# Use [0-9], not \d: extended regular expressions (grep -E) have no \d digit class.
+zgrep '^/type/edition' "$OL_DUMP" |
+  grep -E '"bwb:[0-9]{13}"' |
+  grep -v -E '"isbn_13":' |
+  grep -v -E '"isbn_10"' |
+  pv |
+  gzip > "$OUTPUT"
diff --git a/isbnfrombwbbot/isbn_bwb_to_13.py b/isbnfrombwbbot/isbn_bwb_to_13.py
new file mode 100644
index 00000000..5c4475dc
--- /dev/null
+++ b/isbnfrombwbbot/isbn_bwb_to_13.py
@@ -0,0 +1,86 @@
+"""
+BWB isbn ref to isbn 13
+NOTE: This script ideally works on an Open Library Dump that only contains editions with a BWB isbn ref and no isbn_13
+"""
+import gzip
+import json
+import re
+
+import isbnlib
+import olclient
+
+# Matches a source record made of "bwb:" plus exactly 13 digits; group 1 is the digits.
+BWB_ISBN_13_RE = re.compile(r"bwb:([0-9]{13})")
+
+
+def _find_bwb_isbn_13(source_records):
+    """Return (source_record, digits) for the first BWB 13-digit record, or None.
+
+    The digits are not validated here; the caller checks them with isbnlib.
+    """
+    for source_record in source_records:
+        match = BWB_ISBN_13_RE.fullmatch(source_record)
+        if match:
+            return source_record, match.group(1)
+    return None
+
+
+class ConvertISBNbwbto13Job(olclient.AbstractBotJob):
+    """Bot job that copies the ISBN 13 found in a BWB source record into isbn_13."""
+
+    def run(self) -> None:
+        """Looks for any BWB ISBN to convert to 13.
+
+        Reads the gzipped, tab-separated dump at self.args.file. Each candidate
+        edition is fetched again through self.ol and re-checked before saving.
+        """
+        self.write_changes_declaration()
+        header = {"type": 0, "key": 1, "revision": 2, "last_modified": 3, "JSON": 4}
+        comment = "extract ISBN 13 from BWB source_record"
+        with gzip.open(self.args.file, "rb") as fin:
+            for row in fin:
+                fields = row.decode().split("\t")
+                _json = json.loads(fields[header["JSON"]])
+                if _json["type"]["key"] != "/type/edition":
+                    continue
+
+                # _json is a plain dict, so test for the key; hasattr() on a
+                # dict never sees its keys and would let every row through.
+                if "isbn_13" in _json:
+                    # we only update editions with no existing isbn 13s (for now at least)
+                    continue
+
+                # Resolve the ISBN per row so a row without a BWB ISBN record can
+                # never reuse the value found for an earlier row.
+                found = _find_bwb_isbn_13(_json.get("source_records") or [])
+                if found is None:
+                    continue
+                source_record, isbn_13 = found
+
+                if not isbnlib.is_isbn13(isbn_13):
+                    continue
+
+                olid = _json["key"].split("/")[-1]
+                edition = self.ol.Edition.get(olid)
+                if edition.type["key"] != "/type/edition":
+                    continue
+
+                if hasattr(edition, "isbn_13"):
+                    # don't update editions that already have an isbn 13
+                    continue
+
+                isbns_13 = [isbn_13]
+
+                setattr(edition, "isbn_13", isbns_13)
+                self.logger.info("\t".join([olid, source_record, str(isbns_13)]))
+                self.save(lambda: edition.save(comment=comment))
+
+
+if __name__ == "__main__":
+    job = ConvertISBNbwbto13Job()
+
+    try:
+        job.run()
+    except Exception as e:
+        job.logger.exception(e)
+        raise
diff --git a/isbnfrombwbbot/requirements.txt b/isbnfrombwbbot/requirements.txt
new file mode 100644
index 00000000..c9202c4b
--- /dev/null
+++ b/isbnfrombwbbot/requirements.txt
@@ -0,0 +1,2 @@
+openlibrary-client==0.0.30
+isbnlib==3.10.14