Skip to content

Commit

Permalink
Add ISBN 10 to 13 converter bot
Browse files Browse the repository at this point in the history
  • Loading branch information
davidscotson committed Jun 29, 2023
1 parent cbeb30b commit 44412f2
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 0 deletions.
13 changes: 13 additions & 0 deletions isbn10to13bot/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
A set of scripts to add isbn_13 values to editions with valid isbn_10.
### How To Use
```bash
# Find Editions with ISBN 10, but no ISBN 13
./find_editions_with_isbn_10_not_13.sh /path/to/ol_dump.txt.gz /path/to/filtered_dump.txt.gz
# Add ISBN 13s converted from the ISBN 10
python isbn_10_to_13.py --dump_path=/path/to/filtered_dump.txt.gz --dry_run=<bool> --limit=<int>
```
If `dry_run` is True, the script will run as normal, but no changes will be saved to OpenLibrary.
This is for debugging purposes. By default, `dry_run` is `True`.
`limit` is the maximum number of changes to OpenLibrary that will occur before the script quits.
By default, `limit` is set to `1`. Setting `limit` to `0` allows unlimited edits.
A log is automatically generated whenever `isbn_10_to_13.py` executes.
17 changes: 17 additions & 0 deletions isbn10to13bot/find_editions_with_isbn_10_not_13.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Filter an Open Library dump down to the edition rows that contain an
# "isbn_10" field but no "isbn_13" field, and write them gzip-compressed.
#
# Arguments:
#   $1  path to the (gzip-compressed) Open Library dump to read
#   $2  path of the gzip-compressed filtered dump to write
#
# Uses zgrep, grep, pv and gzip, which must all be available on the PATH.

if [[ -z $1 ]]
then
    # Diagnostics go to stderr so they never end up in redirected output.
    echo "No dump file provided" >&2
    exit 1
fi
if [[ -z $2 ]]
then
    echo "No output file provided" >&2
    exit 1
fi

OL_DUMP=$1
OUTPUT=$2

# Keep edition rows, keep those mentioning isbn_10, drop those mentioning
# isbn_13. The pattern and both paths are quoted so that the shell neither
# glob-expands nor word-splits them (e.g. paths containing spaces).
zgrep '^/type/edition' "$OL_DUMP" | grep -E '"isbn_10":' | grep -v -E '"isbn_13":' | pv | gzip > "$OUTPUT"
80 changes: 80 additions & 0 deletions isbn10to13bot/isbn_10_to_13.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
isbn 10 to isbn 13
NOTE: This script ideally works on an Open Library Dump that only contains editions with an isbn_10 and no isbn_13
"""
import gzip
import json

import isbnlib
import olclient


class ConvertISBN10to13Job(olclient.AbstractBotJob):
    """Bot job that adds ``isbn_13`` values to editions that only have ``isbn_10``.

    The job reads a gzip-compressed, tab-separated Open Library dump (path taken
    from ``self.args.file``) and, for each edition row that has an ``isbn_10``
    field and no ``isbn_13`` field, derives ISBN 13 values with ``isbnlib`` and
    saves them on the live edition.
    """

    @staticmethod
    def _convert_isbns(isbns_10: list) -> list:
        """Return the de-duplicated ISBN 13s derived from *isbns_10*.

        Each entry is canonicalised first. A valid ISBN 10 is converted, a value
        that is already a valid ISBN 13 is kept as-is, and anything else is
        dropped.
        """
        isbns_13 = []
        for isbn in isbns_10:
            canonical = isbnlib.canonical(isbn)
            # Reset on every pass: otherwise an invalid entry would reuse the
            # value left over from the previous entry (or previous edition).
            isbn_13 = None
            if isbnlib.is_isbn10(canonical):
                isbn_13 = isbnlib.to_isbn13(canonical)
            elif isbnlib.is_isbn13(canonical):
                isbn_13 = canonical
            if isbn_13:
                isbns_13.append(isbn_13)
        # Duplicates shouldn't normally occur, but remove them just in case.
        return dedupe(isbns_13)

    def run(self) -> None:
        """Look for any ISBN 10s to convert to 13 and save the results."""
        self.write_changes_declaration()
        # Column positions of the tab-separated dump rows.
        header = {"type": 0, "key": 1, "revision": 2, "last_modified": 3, "JSON": 4}
        comment = "convert ISBN 10 to 13 using isbnlib"
        with gzip.open(self.args.file, "rb") as fin:
            for row in fin:
                row = row.decode().split("\t")
                _json = json.loads(row[header["JSON"]])
                if _json["type"]["key"] != "/type/edition":
                    continue

                # we only update editions with existing isbn 10s
                isbns_10 = _json.get("isbn_10")
                if not isbns_10:
                    continue

                if "isbn_13" in _json:
                    # we only update editions with no existing isbn 13s (for now at least)
                    continue

                # Convert before fetching so that editions without any valid
                # ISBN cost no request and are never saved with an empty list.
                isbns_13 = self._convert_isbns(isbns_10)
                if not isbns_13:
                    continue

                # The dump may be stale, so re-check the live edition.
                olid = _json["key"].split("/")[-1]
                edition = self.ol.Edition.get(olid)
                if edition.type["key"] != "/type/edition":
                    continue

                if hasattr(edition, "isbn_13"):
                    # we only update editions with no existing isbn 13s (for now at least)
                    continue

                edition.isbn_13 = isbns_13
                self.logger.info("\t".join([olid, str(isbns_10), str(isbns_13)]))
                self.save(lambda: edition.save(comment=comment))


def dedupe(input_list: list) -> list:
    """Return a new list holding the first occurrence of each element, in order.

    Membership is tested by equality against the items already kept, so the
    elements do not need to be hashable. The input list is left untouched.
    """
    unique: list = []
    for candidate in input_list:
        if candidate in unique:
            # Already kept an equal element earlier; skip this repeat.
            continue
        unique.append(candidate)
    return unique


if __name__ == "__main__":
    # Script entry point: build the job and run it.
    job = ConvertISBN10to13Job()

    try:
        job.run()
    except Exception as e:
        # Record the failure in the job's log, then re-raise the active
        # exception unchanged so the process still fails with its traceback.
        job.logger.exception(e)
        raise
2 changes: 2 additions & 0 deletions isbn10to13bot/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
openlibrary-client==0.0.30
isbnlib==3.10.14

0 comments on commit 44412f2

Please sign in to comment.