Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve list detection #333

Merged
merged 1 commit into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,6 @@ private List<TableBorderBuilder> findTableBorders(Integer pageNumber) {
i++;
}
}
for (TableBorderBuilder border : tableBorders) {
for (LineChunk lineChunk : border.getVerticalLines()) {
StaticContainers.getLinesCollection().getVerticalLines(pageNumber).remove(lineChunk);
}
for (LineChunk lineChunk : border.getHorizontalLines()) {
StaticContainers.getLinesCollection().getHorizontalLines(pageNumber).remove(lineChunk);
}
}
for (int i = 0; i < tableBorders.size();) {
TableBorderBuilder border = tableBorders.get(i);
if ((border.getHorizontalLinesNumber() <= 2 && border.getVerticalLinesNumber() <= 1) ||
Expand All @@ -95,6 +87,10 @@ private List<TableBorderBuilder> findTableBorders(Integer pageNumber) {
i++;
}
}
for (TableBorderBuilder border : tableBorders) {
StaticContainers.getLinesCollection().getVerticalLines(pageNumber).removeAll(border.getVerticalLines());
StaticContainers.getLinesCollection().getHorizontalLines(pageNumber).removeAll(border.getHorizontalLines());
}
return tableBorders;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public class ListLabelsUtils {

private static final Set<Character> labels = new HashSet<>(
Arrays.asList('\u002D', '\u2022', '\u25CF', '\u2714', '\u2717', '\u2794', '\u27A2', '\uE00A', '\uE00C',
'\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC', '\u25CB', '\u203B')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼, ○, ※)
'\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC', '\u25CB', '\u203B', '\u274D')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼, ○, ※, ❍)
private static final Character o = '\u006F';

public static boolean isListLabel(String value) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,9 @@ protected String getLowerCaseRegex() {
protected String getUpperCaseRegex() {
return UPPER_CASE_ENGLISH_LETTER_REGEX;
}

/**
 * Supplies the letter list used by this algorithm.
 *
 * @return the {@code letters} list visible to this class
 */
@Override
protected List<Character> getLetters() {
return letters;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,22 @@ protected Integer getNumberFromString(String string) {
return getNumberFromLetters1(string);
}

private static String getLetters1FromNumber(int integer) {
private String getLetters1FromNumber(int integer) {
integer--;
int n = integer / letters.size();
char c = letters.get(integer % letters.size());
int n = integer / getLetters().size();
char c = getLetters().get(integer % getLetters().size());
StringBuilder str = new StringBuilder();
for (int i = 0; i <= n; i++) {
str.append(c);
}
return str.toString();
}

private static Integer getNumberFromLetters1(String s) {
private Integer getNumberFromLetters1(String s) {
if (s.isEmpty()) {
return null;
}
int num = letters.indexOf(s.charAt(0));
int num = getLetters().indexOf(s.charAt(0));
if (num < 0) {
return null;
}
Expand All @@ -36,6 +36,6 @@ private static Integer getNumberFromLetters1(String s) {
return null;
}
}
return letters.size() * (s.length() - 1) + num + 1;
return getLetters().size() * (s.length() - 1) + num + 1;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,28 @@ protected Integer getNumberFromString(String string) {
return getNumberFromLetters2(string);
}

private static String getLetters2FromNumber(int integer) {
private String getLetters2FromNumber(int integer) {
StringBuilder str = new StringBuilder();
while (integer > 0) {
integer--;
int k = integer % letters.size();
str.insert(0, letters.get(k));
integer /= letters.size();
int k = integer % getLetters().size();
str.insert(0, getLetters().get(k));
integer /= getLetters().size();
}
return str.toString();
}

private static Integer getNumberFromLetters2(String s) {
private Integer getNumberFromLetters2(String s) {
if (s.isEmpty()) {
return null;
}
int result = 0;
for (char c : s.toCharArray()) {
int num = letters.indexOf(c);
int num = getLetters().indexOf(c);
if (num < 0) {
return null;
}
result = result * letters.size() + num + 1;
result = result * getLetters().size() + num + 1;
}
return result;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection;

import java.util.Arrays;
import java.util.List;

/**
 * List-label detection for labels written as circled Arabic numbers.
 *
 * <p>The recognised characters are circled digit zero (U+24EA), circled numbers 1-20
 * (U+2460 to U+2473), 21-35 (U+3251 to U+325F) and 36-50 (U+32B1 to U+32BF).
 */
public class CircledArabicNumbersListLabelsDetectionAlgorithm extends LettersListLabelsDetectionAlgorithm {

    /**
     * The 51 circled-number characters in ascending numeric order, starting at circled zero.
     * The list order is significant: it is the sequence exposed through {@link #getLetters()}.
     */
    protected static final List<Character> letters = Arrays.asList(
            '\u24EA', '\u2460', '\u2461', '\u2462', '\u2463', '\u2464', '\u2465', '\u2466', '\u2467', '\u2468',
            '\u2469', '\u246A', '\u246B', '\u246C', '\u246D', '\u246E', '\u246F', '\u2470', '\u2471', '\u2472',
            '\u2473', '\u3251', '\u3252', '\u3253', '\u3254', '\u3255', '\u3256', '\u3257', '\u3258', '\u3259',
            '\u325A', '\u325B', '\u325C', '\u325D', '\u325E', '\u325F', '\u32B1', '\u32B2', '\u32B3', '\u32B4',
            '\u32B5', '\u32B6', '\u32B7', '\u32B8', '\u32B9', '\u32BA', '\u32BB', '\u32BC', '\u32BD', '\u32BE',
            '\u32BF');

    /**
     * Matches one or more circled-number characters. Circled numbers have no letter case,
     * so the same pattern serves as the general, lower-case and upper-case regex.
     */
    private static final String CIRCLED_NUMBER_REGEX = "[\u24EA\u2460-\u2473\u3251-\u325F\u32B1-\u32BF]+";

    /**
     * @return the regex matching a run of circled-number characters
     */
    @Override
    protected String getRegex() {
        return CIRCLED_NUMBER_REGEX;
    }

    /**
     * @return the same regex as {@link #getRegex()}, because circled numbers are caseless
     */
    @Override
    protected String getLowerCaseRegex() {
        return CIRCLED_NUMBER_REGEX;
    }

    /**
     * @return the same regex as {@link #getRegex()}, because circled numbers are caseless
     */
    @Override
    protected String getUpperCaseRegex() {
        return CIRCLED_NUMBER_REGEX;
    }

    /**
     * @return the ordered circled-number characters recognised by this algorithm
     */
    @Override
    protected List<Character> getLetters() {
        return letters;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@

public class KoreanLettersListLabelsDetectionAlgorithm extends LettersListLabelsDetectionAlgorithm {

protected static final List<Character> letters = Arrays.asList('가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하');
protected static final List<Character> letters = Arrays.asList('가', '나', '다', '라', '마', '바', '사', '아', '자', '차',
'카', '타', '파', '하', '거', '너', '더', '러', '머', '버', '서', '어', '저', '처', '커', '터', '퍼', '허');

private static final String UPPER_CASE_KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+";
private static final String LOWER_CASE_KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+";
private static final String KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+";
private static final String KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하거너더러머버서어저처커터퍼허]+";
private static final String UPPER_CASE_KOREAN_LETTER_REGEX = KOREAN_LETTER_REGEX;
private static final String LOWER_CASE_KOREAN_LETTER_REGEX = KOREAN_LETTER_REGEX;

@Override
protected String getRegex() {
Expand All @@ -22,36 +23,12 @@ protected String getLowerCaseRegex() {
}

@Override
protected String getUpperCaseRegex() {
return UPPER_CASE_KOREAN_LETTER_REGEX;
}

@Override
protected String getStringFromNumber(Integer number) {
return getLettersFromNumber(number);
protected List<Character> getLetters() {
return letters;
}

@Override
protected Integer getNumberFromString(String string) {
return getNumberFromLetters(string);
}

private static String getLettersFromNumber(int integer) {
integer--;
if (integer < letters.size()) {
return letters.get(integer).toString();
}
return null;
}

private static Integer getNumberFromLetters(String s) {
if (s.length() != 1) {
return null;
}
int num = letters.indexOf(s.charAt(0));
if (num < 0) {
return null;
}
return num + 1;
protected String getUpperCaseRegex() {
return UPPER_CASE_KOREAN_LETTER_REGEX;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ private boolean isBadItem(ListItemTextInfo listItem, String item, String s, int
}

protected abstract String getLowerCaseRegex();

/**
 * Default source of the letter list; this implementation provides none.
 *
 * @return {@code null} in this default implementation
 */
// NOTE(review): code in this class calls getLetters().size() / indexOf() without a null check,
// so a subclass relying on those helpers presumably must override this method - confirm.
protected List<Character> getLetters() {
return null;
}

protected abstract String getUpperCaseRegex();

Expand All @@ -136,4 +140,33 @@ private static boolean isCharMatchRegex(String s, int index, String regex) {
}
return s.substring(index, index + 1).matches(regex);
}

/**
 * Converts a label number to its string form by delegating to
 * {@code getLettersFromNumber}.
 *
 * @param number the label number; it is unboxed to {@code int} for the delegate call
 * @return whatever {@code getLettersFromNumber} returns for that number
 */
@Override
protected String getStringFromNumber(Integer number) {
return getLettersFromNumber(number);
}

/**
 * Converts a label string to its number by delegating to
 * {@code getNumberFromLetters}.
 *
 * @param string the label text
 * @return whatever {@code getNumberFromLetters} returns for that text
 */
@Override
protected Integer getNumberFromString(String string) {
return getNumberFromLetters(string);
}

/**
 * Converts a 1-based position into the single-character label stored at that position
 * of {@link #getLetters()}.
 *
 * @param integer 1-based position of the wanted letter
 * @return the letter as a string, or {@code null} when no letter list is available or the
 *         position lies outside the list
 */
private String getLettersFromNumber(int integer) {
    List<Character> availableLetters = getLetters();
    int index = integer - 1;
    // A null letter list or a position below 1 is treated like a position past the end
    // (null result) rather than failing with an exception.
    if (availableLetters == null || index < 0 || index >= availableLetters.size()) {
        return null;
    }
    return availableLetters.get(index).toString();
}

/**
 * Maps a one-character label to its 1-based position in {@link #getLetters()}.
 *
 * @param s the label text
 * @return the 1-based position, or {@code null} when {@code s} is not exactly one character
 *         long or that character is not in the letter list
 */
private Integer getNumberFromLetters(String s) {
    if (s.length() == 1) {
        int position = getLetters().indexOf(s.charAt(0));
        if (position >= 0) {
            return position + 1;
        }
    }
    return null;
}
}
Loading