> entry : profanityList.getProfaneWords().entrySet()) {
+ for (String word : entry.getValue()) {
+ wordPatterns.put(word, Pattern.compile(buildCombinedRegex(word), Pattern.CASE_INSENSITIVE));
+ }
+ }
+ }
+
+ /**
+  * Build a regex pattern for the given word.
+  * <p>
+  * The pattern matches the exact word and, when at least one
+  * of its characters has an entry in the substitution table,
+  * an obfuscated alternative in which each such character is
+  * replaced by a character class made of the character followed
+  * by its mapped substitution value. Characters are lower-cased
+  * first, and ASCII characters that are not letters or digits
+  * are escaped so they are matched literally instead of being
+  * read as regex syntax.
+  * </p>
+  *
+  * @param word the word to build for
+  * @return the built regex pattern
+  */
+ @NonNull
+ private String buildCombinedRegex(@NonNull String word) {
+     StringBuilder exactWordRegex = new StringBuilder();
+     StringBuilder obfuscatedWordRegex = new StringBuilder();
+
+     for (char character : word.toCharArray()) {
+         char lowerChar = Character.toLowerCase(character);
+
+         // Letters and digits are literal as-is (and "\1" would be a back-reference),
+         // while a backslash before any other ASCII character always yields that
+         // character literally. Non-ASCII characters are never regex syntax, and
+         // leaving them bare keeps surrogate pairs intact.
+         boolean needsEscape = lowerChar < 0x80 && !Character.isLetterOrDigit(lowerChar);
+         String literal = needsEscape ? "\\" + lowerChar : String.valueOf(lowerChar);
+
+         exactWordRegex.append(literal);
+         if (charSubstitutions.containsKey(lowerChar)) {
+             // NOTE(review): the substitution value is appended verbatim into the character class;
+             // this assumes it holds no class metacharacters such as ']' or '^' - confirm.
+             obfuscatedWordRegex.append('[').append(literal).append(charSubstitutions.get(lowerChar)).append(']');
+         } else {
+             obfuscatedWordRegex.append(literal);
+         }
+     }
+
+     // Build the pattern, only adding the obfuscated
+     // alternative when it differs from the exact one
+     return exactWordRegex + ((exactWordRegex.compareTo(obfuscatedWordRegex) == 0) ? "" : "|" + obfuscatedWordRegex);
+ }
}
\ No newline at end of file
diff --git a/API/src/main/java/me/braydon/profanity/service/FiltrationService.java b/API/src/main/java/me/braydon/profanity/service/FiltrationService.java
index b4a11fd..6e55826 100644
--- a/API/src/main/java/me/braydon/profanity/service/FiltrationService.java
+++ b/API/src/main/java/me/braydon/profanity/service/FiltrationService.java
@@ -1,13 +1,27 @@
package me.braydon.profanity.service;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import jakarta.annotation.PostConstruct;
import lombok.NonNull;
+import lombok.SneakyThrows;
+import lombok.extern.log4j.Log4j2;
+import me.braydon.profanity.TextPurifyAPI;
+import me.braydon.profanity.common.ContentTag;
+import me.braydon.profanity.common.Language;
+import me.braydon.profanity.model.ProfanityList;
import me.braydon.profanity.model.input.ContentProcessInput;
import me.braydon.profanity.model.response.ContentProcessResponse;
+import me.braydon.profanity.processor.TextProcessor;
+import me.braydon.profanity.processor.impl.VulgarityProcessor;
import me.braydon.profanity.repository.ProfanityListRepository;
+import org.apache.commons.text.StringEscapeUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
-import java.util.ArrayList;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
/**
* This service is responsible
@@ -15,17 +29,89 @@ import java.util.ArrayList;
*
* @author Braydon
*/
-@Service
+@Service @Log4j2(topic = "Filtration Service")
public final class FiltrationService {
+ /**
+  * The profanity list repository to use.
+  */
@NonNull private final ProfanityListRepository profanityListRepository;
+ /**
+  * The profanity list to use, null
+  * if no list has been loaded.
+  */
+ private ProfanityList profanityList;
+
+ /**
+  * The registered text processors to use.
+  */
+ @NonNull private final List<TextProcessor> textProcessors = Collections.synchronizedList(new ArrayList<>());
+
+ /**
+  * Create the service and register
+  * the text processors it uses.
+  *
+  * @param profanityListRepository the profanity list repository to use
+  */
@Autowired
public FiltrationService(@NonNull ProfanityListRepository profanityListRepository) {
this.profanityListRepository = profanityListRepository;
+
+     // Every processor registered here is
+     // later invoked on processed content
+     this.textProcessors.add(new VulgarityProcessor());
+ }
+
+ /**
+  * Load the profanity list from the repository, populating
+  * the database with the default lists if it is empty.
+  * <p>
+  * When no list is stored, the pre-made list for each
+  * language is downloaded, its entries are split into
+  * words and phrases (an entry containing a space is a
+  * phrase), and the result is saved as the list to use.
+  * </p>
+  */
+ @PostConstruct @SneakyThrows
+ public void populateDefaults() {
+     long before = System.currentTimeMillis();
+
+     // List is already present
+     if ((profanityList = profanityListRepository.getProfanityList()) != null) {
+         log.info("Loaded lists in {}ms", System.currentTimeMillis() - before);
+         return;
+     }
+     // Download the pre-made lists
+     // for each language and save it.
+     log.info("Downloading pre-made lists...");
+     before = System.currentTimeMillis();
+
+     Map<Language, List<String>> profaneWords = new HashMap<>();
+     Map<Language, List<String>> profanePhrases = new HashMap<>();
+     for (Language lang : Language.values()) {
+         // Lower-case with a fixed locale so the file name does
+         // not depend on the default locale of the JVM
+         String contentUrl = "https://raw.githubusercontent.com/Rainnny7/TextPurify/master/lists/" + lang.name().toLowerCase(Locale.ROOT) + ".json";
+
+         // Closing the scanner also closes the underlying
+         // stream, so the connection is released either way
+         JsonArray content;
+         try (Scanner scanner = new Scanner(new URL(contentUrl).openStream(), StandardCharsets.UTF_8)) {
+             content = TextPurifyAPI.GSON.fromJson(scanner.useDelimiter("\\A").next(), JsonArray.class);
+         }
+         for (JsonElement item : content) {
+             String element = item.getAsString();
+             Map<Language, List<String>> target = element.contains(" ") ? profanePhrases : profaneWords;
+             target.computeIfAbsent(lang, key -> new ArrayList<>()).add(element);
+         }
+     }
+     profanityList = profanityListRepository.save(new ProfanityList("primary", new ArrayList<>(), profaneWords, profanePhrases));
+     log.info("Downloaded lists in {}ms", System.currentTimeMillis() - before);
+ }
@NonNull
public ContentProcessResponse process(@NonNull ContentProcessInput input) {
- return new ContentProcessResponse(input.getContent(), new ArrayList<>(), new ArrayList<>(), 0D);
+ List matched = new ArrayList<>(); // The content that was matched
+ List tags = new ArrayList<>(); // Tags obtained from the processed content
+ StringBuilder replacement = new StringBuilder(input.getContent());
+
+ // Handle filtering if a profanity list is present
+ if (profanityList != null) {
+ String content = StringEscapeUtils.escapeJava(input.getContent()).toLowerCase().trim(); // The content to filter
+
+ // Invoke each text processor on the content
+ for (TextProcessor textProcessor : textProcessors) {
+ int before = matched.size();
+ replacement = textProcessor.process(profanityList, content, replacement, input.getReplaceChar(), matched);
+ if (matched.size() > before) {
+ tags.add(textProcessor.getTag());
+ }
+ }
+ }
+
+ // Calculate the score based on
+ // the matched profane content
+ double score = 0D;
+
+ return new ContentProcessResponse(replacement.toString(), matched, tags, score);
}
}
\ No newline at end of file
diff --git a/API/src/main/resources/application.yml b/API/src/main/resources/application.yml
index 263ff6d..38c7d58 100644
--- a/API/src/main/resources/application.yml
+++ b/API/src/main/resources/application.yml
@@ -10,6 +10,13 @@ logging:
# Spring Configuration
spring:
+ data:
+ # MongoDB Configuration. NOTE(review): the URI below embeds a username and password - confirm these are local development placeholders; real credentials should be supplied from outside the repository.
+ mongodb:
+ uri: "mongodb://textpurify:p4$$w0rd@localhost:27017"
+ database: "textpurify"
+ auto-index-creation: true # Automatically create collection indexes
+
# Don't serialize null values by default with Jackson
jackson:
default-property-inclusion: non_null