From e2f7eed8036c338d4b07ecffc4f4c5406cc32065 Mon Sep 17 00:00:00 2001 From: Maxim Date: Thu, 11 Jul 2024 19:40:26 +0300 Subject: [PATCH] Improve list detection. Support korean numeration --- .../utils/ListLabelsUtils.java | 9 ++- ...anLettersListLabelsDetectionAlgorithm.java | 57 +++++++++++++++++++ .../LettersListLabelsDetectionAlgorithm.java | 4 +- 3 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/listLabelsDetection/KoreanLettersListLabelsDetectionAlgorithm.java diff --git a/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/ListLabelsUtils.java b/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/ListLabelsUtils.java index 060ee561..412b8362 100644 --- a/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/ListLabelsUtils.java +++ b/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/ListLabelsUtils.java @@ -6,10 +6,7 @@ import org.verapdf.wcag.algorithms.entities.lists.ListIntervalsCollection; import org.verapdf.wcag.algorithms.entities.lists.info.ListItemInfo; import org.verapdf.wcag.algorithms.entities.lists.info.ListItemTextInfo; -import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.AlfaLettersListLabelsDetectionAlgorithm1; -import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.AlfaLettersListLabelsDetectionAlgorithm2; -import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.ArabicNumbersListLabelsDetectionAlgorithm; -import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.RomanNumbersListLabelsDetectionAlgorithm; +import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.*; import java.util.*; @@ -17,7 +14,7 @@ public class ListLabelsUtils { private static final Set labels = new HashSet<>( Arrays.asList('\u002D', '\u2022', '\u25CF', '\u2714', '\u2717', '\u2794', '\u27A2', '\uE00A', '\uE00C', - '\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼) + '\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC', '\u25CB', '\u203B')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼, ○, ※) private static final Character o = '\u006F'; public static boolean isListLabel(String value) { @@ -70,6 +67,7 @@ public static boolean isListLabels(List listLabels) { } return new RomanNumbersListLabelsDetectionAlgorithm().isListLabels(labels, commonStartLength, commonEndLength) || new ArabicNumbersListLabelsDetectionAlgorithm().isListLabels(labels, commonStartLength, commonEndLength) || + new KoreanLettersListLabelsDetectionAlgorithm().isListLabels(labels, commonStartLength, commonEndLength) || new AlfaLettersListLabelsDetectionAlgorithm1().isListLabels(labels, commonStartLength, commonEndLength) || new AlfaLettersListLabelsDetectionAlgorithm2().isListLabels(labels, commonStartLength, commonEndLength); } @@ -142,6 +140,7 @@ public static Set getListItemsIntervals(List ite ListIntervalsCollection listIntervals = new ListIntervalsCollection(getItemsWithEqualsLabels(itemsInfo)); listIntervals.putAll(new AlfaLettersListLabelsDetectionAlgorithm1().getItemsIntervals(itemsInfo)); listIntervals.putAll(new AlfaLettersListLabelsDetectionAlgorithm2().getItemsIntervals(itemsInfo)); + listIntervals.putAll(new KoreanLettersListLabelsDetectionAlgorithm().getItemsIntervals(itemsInfo)); listIntervals.putAll(new RomanNumbersListLabelsDetectionAlgorithm().getItemsIntervals(itemsInfo)); listIntervals.putAll(new ArabicNumbersListLabelsDetectionAlgorithm().getItemsIntervals(itemsInfo)); return listIntervals.getSet(); diff --git a/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/listLabelsDetection/KoreanLettersListLabelsDetectionAlgorithm.java b/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/listLabelsDetection/KoreanLettersListLabelsDetectionAlgorithm.java new file mode 100644 index 00000000..4978534c --- /dev/null +++ b/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/listLabelsDetection/KoreanLettersListLabelsDetectionAlgorithm.java @@ -0,0 +1,57 @@ +package org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection; + +import java.util.Arrays; +import java.util.List; + +public class KoreanLettersListLabelsDetectionAlgorithm extends LettersListLabelsDetectionAlgorithm { + + protected static final List letters = Arrays.asList('가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하'); + + private static final String UPPER_CASE_KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+"; + private static final String LOWER_CASE_KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+"; + private static final String KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+"; + + @Override + protected String getRegex() { + return KOREAN_LETTER_REGEX; + } + + @Override + protected String getLowerCaseRegex() { + return LOWER_CASE_KOREAN_LETTER_REGEX; + } + + @Override + protected String getUpperCaseRegex() { + return UPPER_CASE_KOREAN_LETTER_REGEX; + } + + @Override + protected String getStringFromNumber(Integer number) { + return getLettersFromNumber(number); + } + + @Override + protected Integer getNumberFromString(String string) { + return getNumberFromLetters(string); + } + + private static String getLettersFromNumber(int integer) { + integer--; + if (integer < letters.size()) { + return letters.get(integer).toString(); + } + return null; + } + + private static Integer getNumberFromLetters(String s) { + if (s.length() != 1) { + return null; + } + int num = letters.indexOf(s.charAt(0)); + if (num < 0) { + return null; + } + return num + 1; + } +} diff --git a/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/listLabelsDetection/LettersListLabelsDetectionAlgorithm.java b/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/listLabelsDetection/LettersListLabelsDetectionAlgorithm.java index b6f35828..97d56371 100644 --- a/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/listLabelsDetection/LettersListLabelsDetectionAlgorithm.java +++ b/src/main/java/org/verapdf/wcag/algorithms/semanticalgorithms/utils/listLabelsDetection/LettersListLabelsDetectionAlgorithm.java @@ -61,7 +61,7 @@ public Set getItemsIntervals(List itemsInfo) { if (number != null) { number++; String s = getStringFromNumber(number); - if (!item.toUpperCase().startsWith(s, start) || !item.startsWith(prefix) || + if (s == null || !item.toUpperCase().startsWith(s, start) || !item.startsWith(prefix) || isCharMatchRegex(item, start + s.length()) || isBadItem(itemInfo, item, s, start) || ((!item.substring(start, start + s.length()).matches(getLowerCaseRegex()) || isUpperCase) && (!item.substring(start, start + s.length()).matches(getUpperCaseRegex()) || !isUpperCase))) { @@ -104,7 +104,7 @@ public Set getItemsIntervals(List itemsInfo) { continue; } //only Roman??? - if (!substring.toUpperCase().startsWith(getStringFromNumber(number))) { + if (getStringFromNumber(number) == null || !substring.toUpperCase().startsWith(getStringFromNumber(number))) { number = null; continue; }