Source code for util.formatting.words

import re
from enum import Enum

import nltk
import stanza
from langdetect import detect
from nltk.corpus import stopwords

# Fetch the required NLTK corpora and stanza models at import time.
nltk.download("punkt")
nltk.download("stopwords")
# noinspection PyArgumentEqualDefault
stanza.download("en", verbose=False)
stanza.download("de", verbose=False)
print("Downloads finished")

# Filler words to strip in addition to the NLTK stopword lists.
MEANINGLESS_WORDS = ["dear", "name", "best", "regards"]

# A set gives O(1) membership tests in get_cleaned_text().
MEANINGLESS_COMBINED = set(MEANINGLESS_WORDS + stopwords.words("english") + stopwords.words("german"))


class Language(Enum):
    EN = "en"
    DE = "de"

class TextProcessor:
    # One stanza pipeline per language, cached on the class so repeated
    # instantiation does not rebuild the (expensive) pipeline.
    _pipelines: dict = {}
    _nltk_lang_map = {Language.EN: "english", Language.DE: "german"}

    def __init__(self, language: Language):
        if language not in self._nltk_lang_map:
            raise ValueError("Unsupported language")
        self.language = language
        self.stopwords = set(stopwords.words(self._nltk_lang_map[language]))
        if language not in self._pipelines:
            self._pipelines[language] = stanza.Pipeline(
                language.value, processors="tokenize,pos,lemma", verbose=False
            )
        self.pipeline = self._pipelines[language]

    def process(self, text: str) -> list[str]:
        # Keep only nouns and adjectives, lemmatized and lower-cased,
        # excluding stopwords of the target language.
        doc = self.pipeline(text)
        return [
            word.lemma.lower()
            for sentence in doc.sentences
            for word in sentence.words
            # stanza may leave lemma unset for rare tokens; guard before .lower()
            if word.upos in ("NOUN", "ADJ")
            and word.lemma is not None
            and word.lemma.lower() not in self.stopwords
        ]

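# Usage sketch (illustrative; the exact lemmas depend on the stanza model, so
# the output shown is an expectation, not a guaranteed result):
#
#     processor = TextProcessor(Language.EN)
#     processor.process("The quick brown foxes were jumping.")
#     # -> ['quick', 'brown', 'fox']
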
def remove_unnecessary_words(text: str) -> list[str]:
    # Detect the language of the text and fall back to English if it is
    # not one of the supported languages.
    lang_str = detect(text)
    try:
        lang = Language(lang_str)
    except ValueError:
        lang = Language.EN
    processor = TextProcessor(lang)
    return processor.process(text)

def count_words(text: str) -> int:
    return len(re.findall(r"\b\w+\b", text))

def remove_duplicate_words(words: list[str]) -> list[str]:
    unique_words = []
    seen_unique_words = set()
    for word in words:
        if word not in seen_unique_words:
            unique_words.append(word)
            seen_unique_words.add(word)
    return unique_words

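# Note: since Python 3.7, list(dict.fromkeys(words)) is an equivalent
# order-preserving deduplication; the explicit loop above is kept for clarity.
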
def get_strongly_cleaned_text(text: str, max_words: int = 10**10) -> str:
    strongly_cleaned = remove_unnecessary_words(text)
    unique_words = remove_duplicate_words(strongly_cleaned)
    return " ".join(unique_words[:max_words])

def get_cleaned_text(text: str, max_words: int = 10**10) -> str:
    # Split on any whitespace (not just single spaces) so newlines and
    # repeated blanks do not produce empty tokens.
    tokens = text.split()
    filtered = [word for word in tokens if word.lower() not in MEANINGLESS_COMBINED]
    return " ".join(filtered[:max_words])
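
# Minimal smoke test; a sketch that assumes the downloads above succeeded and
# that langdetect classifies the sample as English. The sample string is
# purely illustrative.
if __name__ == "__main__":
    sample = "Dear team, the quick brown foxes were jumping over the lazy dogs. Best regards"
    print(count_words(sample))                # whole-word token count
    print(get_cleaned_text(sample))           # stopwords and filler words removed
    print(get_strongly_cleaned_text(sample))  # lemmatized nouns/adjectives, deduplicated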