Skip to content

Commit

Permalink
Merge pull request #318 from veraPDF/headings_list
Browse files Browse the repository at this point in the history
Fix case when headings detected as list
  • Loading branch information
MaximPlusov authored Dec 7, 2023
2 parents a4361f4 + b2a138c commit 894ed1f
Show file tree
Hide file tree
Showing 13 changed files with 738 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

public class PDFList extends InfoChunk {

private final double LIST_ITEM_EPSILON = 0.2;
private static final double LIST_ITEM_EPSILON = 0.2;

private final List<ListItem> listItems;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,10 @@ private List<INode> recognize() {

if (recognizedTable != null) {
if (recognizedTable.getTableBorder() == null && ListUtils.isList(recognizedTable)) {
PDFList list = new PDFList(recognizedTable);
lists.add(list);
if (!HeadingUtils.isHeadings(recognizedTable)) {
PDFList list = new PDFList(recognizedTable);
lists.add(list);
}
} else if (checkTable(recognizedTable)) {
tables.add(recognizedTable);
}
Expand Down Expand Up @@ -424,7 +426,7 @@ private INode updateTreeWithRecognizedTableRow(Table table, TableRow row, TableR
Long id = table.getId();
Map<INode, Integer> cellNodes = new HashMap<>();
for (int i = 0; i < row.getCells().size(); i++) {
INode cellNode = updateTreeWithRecognizedCell(row.getCells().get(i));
INode cellNode = getTableCellNode(row.getCells().get(i));

if (cellNode != null) {
cellNodes.put(cellNode, i);
Expand Down Expand Up @@ -469,7 +471,7 @@ private boolean isHeaderCell(INode cellNode, Integer columnNumber, TableCell pre
return false;
}

private INode updateTreeWithRecognizedCell(TableCell cell) {
public static INode getTableCellNode(TableCell cell) {
Set<INode> tableLeafNodes = new HashSet<>();
for (TableTokenRow tokenRow : cell.getContent()) {
for (TextChunk chunk : tokenRow.getTextChunks()) {
Expand Down Expand Up @@ -535,7 +537,7 @@ private INode updateTreeWithRecognizedList(PDFList list) {

private boolean updateNode(INode node, Long id, SemanticType semanticType, boolean hasTableBorder,
BoundingBox boundingBox) {
if ((((ListUtils.isListNode(node) && !hasTableBorder) || TableUtils.isTableNode(node)) &&
if ((((ListUtils.isDetectedListNode(node) && !hasTableBorder) || TableUtils.isTableNode(node)) &&
node.getRecognizedStructureId() != id) || (semanticType != SemanticType.TABLE && !isNodeInsideTable(node,
id, boundingBox, semanticType))) {
node.setRecognizedStructureId(null);
Expand Down Expand Up @@ -639,7 +641,7 @@ private INode updateTreeWithRecognizedListElement(ListElement listElement) {
return findLocalRoot(tableLeafNodes);
}

private INode findLocalRoot(Set<INode> nodes) {
public static INode findLocalRoot(Set<INode> nodes) {

INode localRoot = null;
for (INode node : nodes) {
Expand Down Expand Up @@ -681,7 +683,7 @@ private boolean isAncestorFor(INode first, INode second) {
return false;
}

private void initTreeCounters(INode root) {
private static void initTreeCounters(INode root) {
if (root == null) {
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ private void acceptChildrenSemanticHeading(INode node) {
boolean singleChild = false;
if (children.size() == 1) {
INode child = children.get(0);
if (child.getInitialSemanticType() == SemanticType.HEADING ||
child.getInitialSemanticType() == SemanticType.NUMBER_HEADING) {
if (HeadingUtils.isInitialHeadingNode(child)) {
INode nextNode = getNextNonEmptyTextNode(child);
singleChild = true;
if (nextNode != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import org.verapdf.wcag.algorithms.entities.geometry.MultiBoundingBox;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SemanticDocumentPostprocessingConsumer extends WCAGConsumer {
Expand Down Expand Up @@ -120,7 +119,7 @@ private void setLowestDepthErrorFlag(INode node) {
node.setHasLowestDepthError();
return;
}
if ((ListUtils.isListNode(node) || ListUtils.isInitialListNode(node)) &&
if ((ListUtils.isDetectedListNode(node) || ListUtils.isInitialListNode(node)) &&
node.getSemanticType() != node.getInitialSemanticType()) {
node.setHasLowestDepthError();
return;
Expand All @@ -142,8 +141,7 @@ private boolean isTextNode(INode node) {
}

private boolean isTitle(INode node) {
return SemanticType.HEADING.equals(node.getSemanticType()) && !SemanticType.HEADING.equals(node.getInitialSemanticType()) ||
SemanticType.NUMBER_HEADING.equals(node.getSemanticType()) && !SemanticType.NUMBER_HEADING.equals(node.getInitialSemanticType());
return HeadingUtils.isDetectedHeadingNode(node) && !HeadingUtils.isInitialHeadingNode(node);
}

private void checkRepeatedAndAdd(List<TextChunk> textChunks) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -633,8 +633,7 @@ private boolean findHeading(INode node, String text, int pageNumber) {
if (node == null) {
return false;
}
if (node.getInitialSemanticType() == SemanticType.NUMBER_HEADING ||
node.getInitialSemanticType() == SemanticType.HEADING) {
if (HeadingUtils.isInitialHeadingNode(node)) {
textValue = getTextChunks(node, pageNumber).stream()
.map(TextChunk::getValue).collect(Collectors.joining(""))
.replaceAll(NON_CONTENT_REGEX, "").toUpperCase();
Expand All @@ -651,7 +650,7 @@ private boolean findHeading(INode node, String text, int pageNumber) {
}
return false;
}
if (node.getSemanticType() == SemanticType.HEADING || node.getSemanticType() == SemanticType.NUMBER_HEADING) {
if (HeadingUtils.isDetectedHeadingNode(node)) {
return true;
}
INode accumulatedNode = StaticContainers.getAccumulatedNodeMapper().get(node);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ public static double imageCaptionProbability(INode node, SemanticFigure imageNod
if (node == null) {
return 0;
}
if (node.getSemanticType() == SemanticType.HEADING ||
node.getSemanticType() == SemanticType.NUMBER_HEADING) {
if (HeadingUtils.isDetectedHeadingNode(node)) {
return 0;
}
INode accumulatedNode = StaticContainers.getAccumulatedNodeMapper().get(node);
Expand Down Expand Up @@ -57,9 +56,7 @@ public static double tableCaptionProbability(INode node, BoundingBox tableBoundi
if (node == null) {
return 0.0;
}
if (node.getSemanticType() == SemanticType.HEADING ||
node.getSemanticType() == SemanticType.NUMBER_HEADING ||
node.getSemanticType() == SemanticType.LIST) {
if (HeadingUtils.isDetectedHeadingNode(node) || node.getSemanticType() == SemanticType.LIST) {
return 0.0;
}
INode accumulatedNode = StaticContainers.getAccumulatedNodeMapper().get(node);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public class ChunksMergeUtils {
private static final double FONT_WHITESPACE_COMPARISON_THRESHOLD = 0.33;
private static final double FONT_LEADING_INTERVAL_STANDARD = 1;
private static final double[] DEFAULT_FONT_CHAR_SPACING_INTERVAL = {0, 0.67};
private static final double[] DEFAULT_FONT_LEADING_INTERVAL = {0.7, 1.5};
private static final double[] DEFAULT_FONT_LEADING_INTERVAL = {0.7, 1.51};
private static final double[] PART_FONT_LEADING_INTERVAL = {0.2, 1.5};

private static final double TO_LINE_PROBABILITY_THRESHOLD = 0.75;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.verapdf.wcag.algorithms.semanticalgorithms.utils;

import org.verapdf.wcag.algorithms.entities.INode;
import org.verapdf.wcag.algorithms.entities.enums.SemanticType;
import org.verapdf.wcag.algorithms.entities.tables.*;
import org.verapdf.wcag.algorithms.semanticalgorithms.consumers.ClusterTableConsumer;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Utility methods for detecting heading semantics, both in the accumulated
 * semantic tree (detected types) and in the initial tag structure tree
 * (initial types), and for recognizing tables that are really heading rows.
 */
public final class HeadingUtils {

    // Semantic types that count as headings for both detected and initial checks.
    private static final Set<SemanticType> HEADING_SEMANTIC_TYPES = new HashSet<>(Arrays.asList(
            SemanticType.HEADING, SemanticType.NUMBER_HEADING));

    private HeadingUtils() {
        // utility class — not instantiable
    }

    /**
     * Checks whether a recognized two-column table is actually a sequence of
     * headings: for each row with a non-empty text first cell, the local root
     * of the two cell nodes (or one of its ancestors) must carry an initial
     * semantic type of HEADING, NUMBER_HEADING or TITLE.
     *
     * @param table recognized table candidate
     * @return {@code true} if every row has exactly two cells and each text row
     *         resolves to an initial heading/title node; {@code false} otherwise
     */
    public static boolean isHeadings(Table table) {
        for (TableRow row : table.getRows()) {
            List<TableCell> cells = row.getCells();
            if (cells.size() != 2) {
                return false;
            }
            TableCell cell = cells.get(0);
            if (cell.isTextCell() && !cell.getContent().isEmpty()) {
                Set<INode> nodes = new HashSet<>();
                nodes.add(ClusterTableConsumer.getTableCellNode(cell));
                nodes.add(ClusterTableConsumer.getTableCellNode(cells.get(1)));
                INode node = ClusterTableConsumer.findLocalRoot(nodes);
                // Walk up from the local root looking for a heading/title tag.
                boolean isHeading = false;
                while (node != null) {
                    if (isInitialHeadingNode(node) || node.getInitialSemanticType() == SemanticType.TITLE) {
                        isHeading = true;
                        break;
                    }
                    node = node.getParent();
                }
                if (!isHeading) {
                    return false;
                }
            }
            // NOTE(review): rows whose first cell is not a non-empty text cell
            // are skipped (treated as compatible with headings) — confirm intended.
        }
        return true;
    }

    /**
     * @param node node to check
     * @return {@code true} if the node's detected (current) semantic type is
     *         HEADING or NUMBER_HEADING
     */
    public static boolean isDetectedHeadingNode(INode node) {
        return HEADING_SEMANTIC_TYPES.contains(node.getSemanticType());
    }

    /**
     * @param node node to check
     * @return {@code true} if the node's initial semantic type (from the tag
     *         structure tree) is HEADING or NUMBER_HEADING
     */
    public static boolean isInitialHeadingNode(INode node) {
        return HEADING_SEMANTIC_TYPES.contains(node.getInitialSemanticType());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public class ListUtils {
SemanticType.LIST, SemanticType.LIST_ITEM,
SemanticType.LIST_LABEL, SemanticType.LIST_BODY));

public static boolean isListNode(INode node) {
public static boolean isDetectedListNode(INode node) {
return listSemanticTypes.contains(node.getSemanticType());
}

Expand Down Expand Up @@ -190,8 +190,7 @@ public static void checkChildrenListInterval(ListIntervalsCollection listInterva
private static boolean isContainsHeading(INode node) {
INode currentNode = node;
while (currentNode.getPageNumber() != null) {
if ((currentNode.getSemanticType() == SemanticType.HEADING ||
currentNode.getSemanticType() == SemanticType.NUMBER_HEADING) &&
if (HeadingUtils.isDetectedHeadingNode(currentNode) &&
currentNode.getCorrectSemanticScore() >= NodeUtils.MIN_GOOD_HEADING_PROBABILITY) {
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ public static double headingProbability(SemanticTextNode textNode, SemanticTextN
if (textNode.isStartsWithArabicNumber()) {
headingProbability += HEADING_PROBABILITY_PARAMS[2];
}
SemanticType initialSemanticType = initialNode.getInitialSemanticType();
if (SemanticType.HEADING.equals(initialSemanticType) || SemanticType.NUMBER_HEADING.equals(initialSemanticType)) {
if (HeadingUtils.isInitialHeadingNode(initialNode)) {
headingProbability += HEADING_PROBABILITY_PARAMS[3];
}
INode nextNeighbor = getNextNonEmptyNode(initialNode);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ class AccumulatedNodeSemanticsCheckerTests {

static {
ignoredSemanticTypes.add(SemanticType.LINK);
ignoredSemanticTypes.add(SemanticType.ANNOT);
ignoredSemanticTypes.add(SemanticType.FIGURE);
ignoredSemanticTypes.add(SemanticType.FORM);
ignoredSemanticTypes.add(SemanticType.TABLE);
Expand Down Expand Up @@ -80,6 +79,7 @@ void testSemanticCorrectness(String jsonPdfPath, double probability, SemanticTyp

static Stream<Arguments> treeSemanticCorrectnessTestParams() {
return Stream.of(
Arguments.of("annots/annot1.json"),
Arguments.of("headings/Heading1.json"),
Arguments.of("headings/Heading2.json"),
Arguments.of("headings/Heading3.json"),
Expand Down Expand Up @@ -115,6 +115,7 @@ static Stream<Arguments> treeSemanticCorrectnessTestParams() {
Arguments.of("spans/Span1.json"),
Arguments.of("paragraphs/one_line_vertical_paragraph.json"),
Arguments.of("paragraphs/paragraph1.json"),
Arguments.of("paragraphs/paragraph2.json"),
Arguments.of("paragraphs/paragraph_arrows.json"),
Arguments.of("paragraphs/lwg_footnote.json"),
Arguments.of("paragraphs/paragraph_not_caption_danish.json"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ void testListDetection(String filename, int[] checkSizes, boolean semanticIsVali

private void testListTreeStructure(ITree tree) {
for (INode node : tree) {
if (ListUtils.isListNode(node) && !node.isLeaf()) {
if (ListUtils.isDetectedListNode(node) && !node.isLeaf()) {
Assertions.assertEquals(node.getInitialSemanticType(), node.getSemanticType());
}
}
Expand Down
Loading

0 comments on commit 894ed1f

Please sign in to comment.