Skip to content

Commit

Permalink
Improve list detection
Browse files: browse the repository at this point in the history
  • Loading branch information
MaximPlusov committed Jul 26, 2024
1 parent d20ce32 commit ea01bf5
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 54 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,6 @@ private List<TableBorderBuilder> findTableBorders(Integer pageNumber) {
i++;
}
}
for (TableBorderBuilder border : tableBorders) {
for (LineChunk lineChunk : border.getVerticalLines()) {
StaticContainers.getLinesCollection().getVerticalLines(pageNumber).remove(lineChunk);
}
for (LineChunk lineChunk : border.getHorizontalLines()) {
StaticContainers.getLinesCollection().getHorizontalLines(pageNumber).remove(lineChunk);
}
}
for (int i = 0; i < tableBorders.size();) {
TableBorderBuilder border = tableBorders.get(i);
if ((border.getHorizontalLinesNumber() <= 2 && border.getVerticalLinesNumber() <= 1) ||
Expand All @@ -95,6 +87,10 @@ private List<TableBorderBuilder> findTableBorders(Integer pageNumber) {
i++;
}
}
for (TableBorderBuilder border : tableBorders) {
StaticContainers.getLinesCollection().getVerticalLines(pageNumber).removeAll(border.getVerticalLines());
StaticContainers.getLinesCollection().getHorizontalLines(pageNumber).removeAll(border.getHorizontalLines());
}
return tableBorders;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public class ListLabelsUtils {

private static final Set<Character> labels = new HashSet<>(
Arrays.asList('\u002D', '\u2022', '\u25CF', '\u2714', '\u2717', '\u2794', '\u27A2', '\uE00A', '\uE00C',
'\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC', '\u25CB', '\u203B')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼, ○, ※)
'\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC', '\u25CB', '\u203B', '\u274D')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼, ○, ※, ❍)
private static final Character o = '\u006F';

public static boolean isListLabel(String value) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,9 @@ protected String getLowerCaseRegex() {
protected String getUpperCaseRegex() {
return UPPER_CASE_ENGLISH_LETTER_REGEX;
}

/**
 * Supplies the label characters for this algorithm by returning the {@code letters} field.
 * The field itself is declared outside this hunk.
 *
 * @return the {@code letters} list
 */
@Override
protected List<Character> getLetters() {
return letters;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,22 @@ protected Integer getNumberFromString(String string) {
return getNumberFromLetters1(string);
}

private static String getLetters1FromNumber(int integer) {
private String getLetters1FromNumber(int integer) {
integer--;
int n = integer / letters.size();
char c = letters.get(integer % letters.size());
int n = integer / getLetters().size();
char c = getLetters().get(integer % getLetters().size());
StringBuilder str = new StringBuilder();
for (int i = 0; i <= n; i++) {
str.append(c);
}
return str.toString();
}

private static Integer getNumberFromLetters1(String s) {
private Integer getNumberFromLetters1(String s) {
if (s.isEmpty()) {
return null;
}
int num = letters.indexOf(s.charAt(0));
int num = getLetters().indexOf(s.charAt(0));
if (num < 0) {
return null;
}
Expand All @@ -36,6 +36,6 @@ private static Integer getNumberFromLetters1(String s) {
return null;
}
}
return letters.size() * (s.length() - 1) + num + 1;
return getLetters().size() * (s.length() - 1) + num + 1;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,28 @@ protected Integer getNumberFromString(String string) {
return getNumberFromLetters2(string);
}

private static String getLetters2FromNumber(int integer) {
private String getLetters2FromNumber(int integer) {
StringBuilder str = new StringBuilder();
while (integer > 0) {
integer--;
int k = integer % letters.size();
str.insert(0, letters.get(k));
integer /= letters.size();
int k = integer % getLetters().size();
str.insert(0, getLetters().get(k));
integer /= getLetters().size();
}
return str.toString();
}

private static Integer getNumberFromLetters2(String s) {
private Integer getNumberFromLetters2(String s) {
if (s.isEmpty()) {
return null;
}
int result = 0;
for (char c : s.toCharArray()) {
int num = letters.indexOf(c);
int num = getLetters().indexOf(c);
if (num < 0) {
return null;
}
result = result * letters.size() + num + 1;
result = result * getLetters().size() + num + 1;
}
return result;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection;

import java.util.Arrays;
import java.util.List;

/**
 * List-label detection for labels written as circled Arabic numbers.
 *
 * <p>The recognised labels are the Unicode circled digit zero (U+24EA), circled 1-20
 * (U+2460..U+2473), circled 21-35 (U+3251..U+325F) and circled 36-50 (U+32B1..U+32BF).
 * These glyphs have no letter case, so the general, lower-case and upper-case patterns
 * are all the same expression.
 */
public class CircledArabicNumbersListLabelsDetectionAlgorithm extends LettersListLabelsDetectionAlgorithm {

    /**
     * Circled numbers in ascending numeric order, starting with circled zero.
     * The order is significant: the list is exposed through {@link #getLetters()} and is
     * searched by position.
     */
    protected static final List<Character> letters = Arrays.asList(
            '\u24EA', '\u2460', '\u2461', '\u2462', '\u2463', '\u2464', '\u2465', '\u2466', '\u2467', '\u2468',
            '\u2469', '\u246A', '\u246B', '\u246C', '\u246D', '\u246E', '\u246F', '\u2470', '\u2471', '\u2472',
            '\u2473', '\u3251', '\u3252', '\u3253', '\u3254', '\u3255', '\u3256', '\u3257', '\u3258', '\u3259',
            '\u325A', '\u325B', '\u325C', '\u325D', '\u325E', '\u325F', '\u32B1', '\u32B2', '\u32B3', '\u32B4',
            '\u32B5', '\u32B6', '\u32B7', '\u32B8', '\u32B9', '\u32BA', '\u32BB', '\u32BC', '\u32BD', '\u32BE',
            '\u32BF');

    /** Matches one or more consecutive circled-number characters from {@link #letters}. */
    private static final String CIRCLED_NUMBER_REGEX = "[\u24EA\u2460-\u2473\u3251-\u325F\u32B1-\u32BF]+";

    /**
     * @return the pattern matching a run of circled numbers
     */
    @Override
    protected String getRegex() {
        return CIRCLED_NUMBER_REGEX;
    }

    /**
     * @return the same pattern as {@link #getRegex()}; circled numbers have no lower case
     */
    @Override
    protected String getLowerCaseRegex() {
        return CIRCLED_NUMBER_REGEX;
    }

    /**
     * @return the ordered circled-number characters handled by this algorithm
     */
    @Override
    protected List<Character> getLetters() {
        return letters;
    }

    /**
     * @return the same pattern as {@link #getRegex()}; circled numbers have no upper case
     */
    @Override
    protected String getUpperCaseRegex() {
        return CIRCLED_NUMBER_REGEX;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@

public class KoreanLettersListLabelsDetectionAlgorithm extends LettersListLabelsDetectionAlgorithm {

protected static final List<Character> letters = Arrays.asList('가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하');
protected static final List<Character> letters = Arrays.asList('가', '나', '다', '라', '마', '바', '사', '아', '자', '차',
'카', '타', '파', '하', '거', '너', '더', '러', '머', '버', '서', '어', '저', '처', '커', '터', '퍼', '허');

private static final String UPPER_CASE_KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+";
private static final String LOWER_CASE_KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+";
private static final String KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+";
private static final String KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하거너더러머버서어저처커터퍼허]+";
private static final String UPPER_CASE_KOREAN_LETTER_REGEX = KOREAN_LETTER_REGEX;
private static final String LOWER_CASE_KOREAN_LETTER_REGEX = KOREAN_LETTER_REGEX;

@Override
protected String getRegex() {
Expand All @@ -22,36 +23,12 @@ protected String getLowerCaseRegex() {
}

@Override
protected String getUpperCaseRegex() {
return UPPER_CASE_KOREAN_LETTER_REGEX;
}

@Override
protected String getStringFromNumber(Integer number) {
return getLettersFromNumber(number);
protected List<Character> getLetters() {
return letters;
}

@Override
protected Integer getNumberFromString(String string) {
return getNumberFromLetters(string);
}

private static String getLettersFromNumber(int integer) {
integer--;
if (integer < letters.size()) {
return letters.get(integer).toString();
}
return null;
}

private static Integer getNumberFromLetters(String s) {
if (s.length() != 1) {
return null;
}
int num = letters.indexOf(s.charAt(0));
if (num < 0) {
return null;
}
return num + 1;
protected String getUpperCaseRegex() {
return UPPER_CASE_KOREAN_LETTER_REGEX;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ private boolean isBadItem(ListItemTextInfo listItem, String item, String s, int
}

protected abstract String getLowerCaseRegex();

/**
 * Returns the ordered characters that this algorithm treats as list labels.
 *
 * <p>The default is an empty list rather than {@code null}, so that code which calls
 * {@code getLetters().size()} or {@code getLetters().indexOf(...)} does not fail with a
 * {@link NullPointerException} when a subclass does not override this method.
 * Subclasses that convert between labels and numbers override it with their own sequence.
 *
 * @return the label characters in order; never {@code null}
 */
protected List<Character> getLetters() {
    return java.util.Collections.emptyList();
}

protected abstract String getUpperCaseRegex();

Expand All @@ -136,4 +140,33 @@ private static boolean isCharMatchRegex(String s, int index, String regex) {
}
return s.substring(index, index + 1).matches(regex);
}

/**
 * Produces the label text for the given ordinal by delegating to
 * {@code getLettersFromNumber}.
 *
 * @param number ordinal of the label; it is unboxed to {@code int} before delegating
 * @return whatever {@code getLettersFromNumber} returns for that ordinal
 */
@Override
protected String getStringFromNumber(Integer number) {
    final int ordinal = number;
    return getLettersFromNumber(ordinal);
}

/**
 * Resolves the ordinal of the given label text by delegating to
 * {@code getNumberFromLetters}.
 *
 * @param string label text to resolve
 * @return whatever {@code getNumberFromLetters} returns for that text
 */
@Override
protected Integer getNumberFromString(String string) {
    final Integer ordinal = getNumberFromLetters(string);
    return ordinal;
}

/**
 * Converts a 1-based ordinal into the single-character label found at that position of
 * {@code getLetters()}.
 *
 * @param integer 1-based position of the wanted label
 * @return the one-character label, or {@code null} when the ordinal is smaller than 1 or
 *         larger than the number of available characters
 */
private String getLettersFromNumber(int integer) {
    final List<Character> alphabet = getLetters();
    final int index = integer - 1;
    // Ordinals below 1 give a negative index; List.get would throw for those, so they are
    // rejected in the same way as ordinals past the end of the list.
    if (index < 0 || index >= alphabet.size()) {
        return null;
    }
    return alphabet.get(index).toString();
}

/**
 * Converts a single-character label into its 1-based position within
 * {@code getLetters()}.
 *
 * @param s label text to look up
 * @return the 1-based position of the character, or {@code null} when {@code s} is not
 *         exactly one character long or the character is not in the list
 */
private Integer getNumberFromLetters(String s) {
    if (s.length() == 1) {
        final int position = getLetters().indexOf(s.charAt(0));
        if (position >= 0) {
            return position + 1;
        }
    }
    return null;
}
}

0 comments on commit ea01bf5

Please sign in to comment.