diff --git a/multi_rake/utils.py b/multi_rake/utils.py index 1ad8079..24cb94e 100644 --- a/multi_rake/utils.py +++ b/multi_rake/utils.py @@ -11,6 +11,20 @@ def detect_language(text, proba_threshold): + """Detect language code and probability of input text based on 'cld2'. + + Parameters + ---------- + text : utf8Bytes + Text to detect language as unicode. + proba_threshold : float + Minimum probability cld2 language detection has to output in order to accept proposed language code. + + Returns + ------- + str + Language code detected by cld2. + """ _, _, details = cld2.detect(text) language_code = details[0].language_code @@ -20,11 +34,35 @@ def detect_language(text, proba_threshold): return language_code -def keep_only_letters(string): - return ' '.join(token.group() for token in LETTERS_RE.finditer(string)) +def keep_only_letters(text): + """Apply regex to only keep letters. + + Parameters + ---------- + text : str + Text to search for letters in. + + Returns + ------- + str + Input text cleaned by regex to only contain letters. + """ + return ' '.join(token.group() for token in LETTERS_RE.finditer(text)) def separate_words(text): + """Separate text into tokens by whitespace and dismiss numeric tokens. + + Parameters + ---------- + text : str + Text to tokenize. + + Returns + ------- + list of str + Tokenized text. + """ words = [] for word in text.split(): @@ -35,5 +73,17 @@ def split_sentences(text): + """Split text into sentences with custom regex boundaries. + + Parameters + ---------- + text : str + Text to split on sentence delimiters. + + Returns + ------- + list of str + Text split into sentences. + """ sentences = SENTENCE_DELIMITERS_RE.split(text) return sentences