Skip to content

Commit

Permalink
Improve list detection. Support Korean numeration
Browse files Browse the repository at this point in the history
  • Loading branch information
MaximPlusov committed Jul 12, 2024
1 parent 35011f1 commit e2f7eed
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,15 @@
import org.verapdf.wcag.algorithms.entities.lists.ListIntervalsCollection;
import org.verapdf.wcag.algorithms.entities.lists.info.ListItemInfo;
import org.verapdf.wcag.algorithms.entities.lists.info.ListItemTextInfo;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.AlfaLettersListLabelsDetectionAlgorithm1;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.AlfaLettersListLabelsDetectionAlgorithm2;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.ArabicNumbersListLabelsDetectionAlgorithm;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.RomanNumbersListLabelsDetectionAlgorithm;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.*;

import java.util.*;

public class ListLabelsUtils {

private static final Set<Character> labels = new HashSet<>(
Arrays.asList('\u002D', '\u2022', '\u25CF', '\u2714', '\u2717', '\u2794', '\u27A2', '\uE00A', '\uE00C',
'\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼)
'\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC', '\u25CB', '\u203B')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼, ○, ※)
private static final Character o = '\u006F';

public static boolean isListLabel(String value) {
Expand Down Expand Up @@ -70,6 +67,7 @@ public static boolean isListLabels(List<String> listLabels) {
}
return new RomanNumbersListLabelsDetectionAlgorithm().isListLabels(labels, commonStartLength, commonEndLength) ||
new ArabicNumbersListLabelsDetectionAlgorithm().isListLabels(labels, commonStartLength, commonEndLength) ||
new KoreanLettersListLabelsDetectionAlgorithm().isListLabels(labels, commonStartLength, commonEndLength) ||
new AlfaLettersListLabelsDetectionAlgorithm1().isListLabels(labels, commonStartLength, commonEndLength) ||
new AlfaLettersListLabelsDetectionAlgorithm2().isListLabels(labels, commonStartLength, commonEndLength);
}
Expand Down Expand Up @@ -142,6 +140,7 @@ public static Set<ListInterval> getListItemsIntervals(List<ListItemTextInfo> ite
ListIntervalsCollection listIntervals = new ListIntervalsCollection(getItemsWithEqualsLabels(itemsInfo));
listIntervals.putAll(new AlfaLettersListLabelsDetectionAlgorithm1().getItemsIntervals(itemsInfo));
listIntervals.putAll(new AlfaLettersListLabelsDetectionAlgorithm2().getItemsIntervals(itemsInfo));
listIntervals.putAll(new KoreanLettersListLabelsDetectionAlgorithm().getItemsIntervals(itemsInfo));
listIntervals.putAll(new RomanNumbersListLabelsDetectionAlgorithm().getItemsIntervals(itemsInfo));
listIntervals.putAll(new ArabicNumbersListLabelsDetectionAlgorithm().getItemsIntervals(itemsInfo));
return listIntervals.getSet();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection;

import java.util.Arrays;
import java.util.List;

/**
 * List-label detection for items enumerated with Korean (Hangul) syllables.
 *
 * <p>Labels are mapped to 1-based ordinals by their position in {@link #letters}:
 * the first entry corresponds to 1, the last entry to {@code letters.size()}.
 * Only single-syllable labels are convertible to a number; anything else yields
 * {@code null}, which is the "not a label of this kind" signal for the conversions
 * in this class.
 */
public class KoreanLettersListLabelsDetectionAlgorithm extends LettersListLabelsDetectionAlgorithm {

    /** Ordered label syllables; index {@code i} holds the label for ordinal {@code i + 1}. */
    protected static final List<Character> letters = Arrays.asList(
            '가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하');

    /** Matches one or more of the syllables listed in {@link #letters}. */
    private static final String KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+";

    // These syllables have no upper/lower case forms, so both case-specific
    // patterns are intentionally the same as the general one.
    private static final String UPPER_CASE_KOREAN_LETTER_REGEX = KOREAN_LETTER_REGEX;
    private static final String LOWER_CASE_KOREAN_LETTER_REGEX = KOREAN_LETTER_REGEX;

    /**
     * @return the pattern matching a run of supported label syllables
     */
    @Override
    protected String getRegex() {
        return KOREAN_LETTER_REGEX;
    }

    /**
     * @return the "lower case" pattern; identical to {@link #getRegex()}
     */
    @Override
    protected String getLowerCaseRegex() {
        return LOWER_CASE_KOREAN_LETTER_REGEX;
    }

    /**
     * @return the "upper case" pattern; identical to {@link #getRegex()}
     */
    @Override
    protected String getUpperCaseRegex() {
        return UPPER_CASE_KOREAN_LETTER_REGEX;
    }

    /**
     * Converts a 1-based ordinal to its label.
     *
     * @param number the ordinal; may be {@code null}
     * @return the single-syllable label, or {@code null} when {@code number} is
     *         {@code null} or outside {@code 1..letters.size()}
     */
    @Override
    protected String getStringFromNumber(Integer number) {
        // Guard before auto-unboxing: a null Integer would otherwise throw NPE.
        if (number == null) {
            return null;
        }
        return getLettersFromNumber(number);
    }

    /**
     * Converts a label to its 1-based ordinal.
     *
     * @param string the candidate label; may be {@code null}
     * @return the ordinal, or {@code null} when {@code string} is {@code null},
     *         is not exactly one character long, or is not a supported syllable
     */
    @Override
    protected Integer getNumberFromString(String string) {
        if (string == null) {
            return null;
        }
        return getNumberFromLetters(string);
    }

    /**
     * Looks up the label for a 1-based ordinal.
     *
     * @return the label, or {@code null} when the ordinal has no corresponding entry
     */
    private static String getLettersFromNumber(int integer) {
        int index = integer - 1;
        // Check both bounds: ordinals <= 0 previously reached letters.get(-1)
        // and failed with IndexOutOfBoundsException instead of returning null.
        if (index < 0 || index >= letters.size()) {
            return null;
        }
        return letters.get(index).toString();
    }

    /**
     * Looks up the 1-based ordinal of a single-character label.
     *
     * @return the ordinal, or {@code null} when {@code s} is not one supported syllable
     */
    private static Integer getNumberFromLetters(String s) {
        if (s.length() != 1) {
            return null;
        }
        int position = letters.indexOf(s.charAt(0));
        if (position < 0) {
            return null;
        }
        return position + 1;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public Set<ListInterval> getItemsIntervals(List<ListItemTextInfo> itemsInfo) {
if (number != null) {
number++;
String s = getStringFromNumber(number);
if (!item.toUpperCase().startsWith(s, start) || !item.startsWith(prefix) ||
if (s == null || !item.toUpperCase().startsWith(s, start) || !item.startsWith(prefix) ||
isCharMatchRegex(item, start + s.length()) || isBadItem(itemInfo, item, s, start) ||
((!item.substring(start, start + s.length()).matches(getLowerCaseRegex()) || isUpperCase) &&
(!item.substring(start, start + s.length()).matches(getUpperCaseRegex()) || !isUpperCase))) {
Expand Down Expand Up @@ -104,7 +104,7 @@ public Set<ListInterval> getItemsIntervals(List<ListItemTextInfo> itemsInfo) {
continue;
}
//only Roman???
if (!substring.toUpperCase().startsWith(getStringFromNumber(number))) {
if (getStringFromNumber(number) == null || !substring.toUpperCase().startsWith(getStringFromNumber(number))) {
number = null;
continue;
}
Expand Down

0 comments on commit e2f7eed

Please sign in to comment.