Skip to content

Commit

Permalink
PageXML: respect --output_dir
Browse files Browse the repository at this point in the history
  • Loading branch information
bertsky committed Oct 1, 2024
1 parent 9b4473f commit 830648c
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions calamari_ocr/ocr/dataset/datareader/pagexml/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,10 +390,13 @@ def cutout(
def prepare_store(self):
    """Reset the bookkeeping attributes before predictions are stored.

    Starts with an empty mapping from page id to output directory, restarts
    the word id counter at zero and forgets the last seen page id.
    """
    # page id -> directory that page should be written to
    self._output_dir = {}
    self._next_word_id = 0
    self._last_page_id = None

def store_text_prediction(self, prediction, sample_id, output_dir):
sentence = prediction.sentence
sample = self.sample_by_id(sample_id)
output_dir = output_dir or os.path.dirname(sample['page_id'])
self._output_dir[sample['page_id']] = output_dir
ns = sample["ns"]
line = sample["xml_element"]
textequivxml = line.find('./ns:TextEquiv[@index="{}"]'.format(self.params.text_index), namespaces=ns)
Expand Down Expand Up @@ -440,8 +443,11 @@ def store(self):
desc="Writing PageXML files",
total=len(self.params.xmlfiles),
):
page = self.pages(split_all_ext(xml)[0])
with open(split_all_ext(xml)[0] + extension, "w", encoding="utf-8") as f:
page_id = split_all_ext(xml)[0]
page = self.pages(page_id)
path = os.path.join(self._output_dir[page_id],
filename(xml) + extension)
with open(path, "w", encoding="utf-8") as f:
f.write(etree.tounicode(page.getroottree(), pretty_print=True))

@staticmethod
Expand Down Expand Up @@ -619,7 +625,9 @@ def _words_from_prediction(prediction: Prediction) -> list:

def _store_page(self, extension, page_id):
    """Write the PageXML tree of one page into its output directory.

    Args:
        extension: Suffix appended to the page's file name to form the
            output file name.
        page_id: Key of the page in ``self.pages``; also used as a path
            when deriving the fallback directory and the file name.

    Raises:
        KeyError: If ``page_id`` is not present in ``self.pages``.
        OSError: If the target file cannot be opened for writing.
    """
    page = self.pages[page_id]
    # A page that never went through store_text_prediction has no recorded
    # directory. Fall back to the directory of the page itself, which is the
    # same default store_text_prediction uses when no output_dir is given.
    output_dir = self._output_dir.get(page_id, os.path.dirname(page_id))
    # NOTE(review): filename() is defined elsewhere; presumably it returns
    # the base name without directory and extensions -- confirm.
    path = os.path.join(output_dir, filename(page_id) + extension)
    with open(path, "w", encoding="utf-8") as f:
        f.write(etree.tounicode(page.getroottree(), pretty_print=True))

def _sample_iterator(self):
Expand Down

0 comments on commit 830648c

Please sign in to comment.