import re
from enum import Enum
import nltk
import stanza
from langdetect import LangDetectException, detect
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")
# noinspection PyArgumentEqualDefault
stanza.download("en", verbose=False)
stanza.download("de", verbose=False)
print("Downloads finished")

MEANINGLESS_WORDS = ["dear", "name", "best", "regards"]
# Filler words plus English and German stopwords, used by get_cleaned_text.
MEANINGLESS_COMBINED = set(MEANINGLESS_WORDS) | set(stopwords.words("english")) | set(stopwords.words("german"))


class Language(Enum):
    EN = "en"
    DE = "de"


class TextProcessor:
    # Stanza pipelines are cached per language and shared across instances.
    _pipelines = {}
    _nltk_lang_map = {Language.EN: "english", Language.DE: "german"}

    def __init__(self, language: Language):
        if language not in self._nltk_lang_map:
            raise ValueError("Unsupported language")
        self.language = language
        self.stopwords = set(stopwords.words(self._nltk_lang_map[language]))
        # Build the Stanza pipeline once per language and reuse it afterwards.
        if language not in self._pipelines:
            self._pipelines[language] = stanza.Pipeline(
                language.value, processors="tokenize,pos,lemma", verbose=False
            )
        self.pipeline = self._pipelines[language]

    def process(self, text: str) -> list[str]:
        # Keep lowercased lemmas of nouns and adjectives that are not stopwords.
        doc = self.pipeline(text)
        return [
            word.lemma.lower()
            for sentence in doc.sentences
            for word in sentence.words
            if word.lemma is not None
            and word.upos in ("NOUN", "ADJ")
            and word.lemma.lower() not in self.stopwords
        ]
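
# Usage sketch for TextProcessor (illustrative only; the example sentence is an
# assumption, not part of the module):
#   processor = TextProcessor(Language.EN)
#   lemmas = processor.process("The quick brown foxes were jumping over lazy dogs.")
#   # -> lowercased noun/adjective lemmas, roughly ["quick", "brown", "fox", "lazy", "dog"]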


def remove_unnecessary_words(text: str) -> list[str]:
    # Detect the input language; fall back to English when detection fails or
    # the detected language is not supported.
    try:
        lang = Language(detect(text))
    except (LangDetectException, ValueError):
        lang = Language.EN
    processor = TextProcessor(lang)
    return processor.process(text)


def count_words(text: str) -> int:
    # Count word-like tokens with a simple regex.
    return len(re.findall(r"\b\w+\b", text))


def remove_duplicate_words(words: list[str]) -> list[str]:
    # Drop duplicates while preserving the original word order.
    unique_words = []
    seen_unique_words = set()
    for word in words:
        if word not in seen_unique_words:
            unique_words.append(word)
            seen_unique_words.add(word)
    return unique_words


def get_strongly_cleaned_text(text: str, max_words: int = 10 ** 10) -> str:
    # Heavy cleaning: lemmatize, keep noun/adjective lemmas, deduplicate, cap length.
    strongly_cleaned = remove_unnecessary_words(text)
    unique_words = remove_duplicate_words(strongly_cleaned)
    return " ".join(unique_words[:max_words])


def get_cleaned_text(text: str, max_words: int = 10 ** 10) -> str:
    # Light cleaning: drop stopwords and filler words, keep order and duplicates.
    tokens = text.split(" ")
    filtered = [word for word in tokens if word.lower() not in MEANINGLESS_COMBINED]
    return " ".join(filtered[:max_words])
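

# Minimal usage sketch (not part of the original module). The sample sentence is
# illustrative only; running it requires the NLTK/Stanza downloads performed above.
if __name__ == "__main__":
    sample = "Dear team, the new models show strong results on noisy test data."
    print(count_words(sample))                    # number of word-like tokens
    print(get_cleaned_text(sample, max_words=5))  # light stopword/filler filtering
    print(get_strongly_cleaned_text(sample))      # unique noun/adjective lemmas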