Basic functionality

2024-06-06 21:18:06 -04:00 · 2024-06-06 21:18:06 -04:00 · 6db88e8932
commit 6db88e8932
parent b003ed5837
10 changed files with 295 additions and 10 deletions
--- a/API/pom.xml
+++ b/API/pom.xml
@ -49,6 +49,12 @@
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

+        <!-- MongoDB -->
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-data-mongodb</artifactId>
+        </dependency>
+
        <!-- Libraries -->
        <dependency>
            <groupId>org.projectlombok</groupId>
@ -56,5 +62,17 @@
            <version>1.18.32</version>
            <scope>provided</scope>
        </dependency>
+        <dependency>
+            <groupId>com.google.code.gson</groupId>
+            <artifactId>gson</artifactId>
+            <version>2.11.0</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-text</artifactId>
+            <version>1.12.0</version>
+            <scope>compile</scope>
+        </dependency>
    </dependencies>
 </project>
--- a/API/src/main/java/me/braydon/profanity/TextPurifyAPI.java
+++ b/API/src/main/java/me/braydon/profanity/TextPurifyAPI.java
@ -1,5 +1,6 @@
 package me.braydon.profanity;

+import com.google.gson.Gson;
 import lombok.NonNull;
 import lombok.SneakyThrows;
 import lombok.extern.log4j.Log4j2;
@ -17,6 +18,8 @@ import java.util.Objects;
@SpringBootApplication
@Log4j2(topic = "TextPurify")
 public class TextPurifyAPI {
+    public static final Gson GSON = new Gson();
+
    @SneakyThrows
    public static void main(@NonNull String[] args) {
        // Handle loading of our configuration file
--- a/API/src/main/java/me/braydon/profanity/common/ContentTag.java
+++ b/API/src/main/java/me/braydon/profanity/common/ContentTag.java
@ -1,4 +1,10 @@
-package me.braydon.profanity.common;/**
+package me.braydon.profanity.common;
+
+/**
+ * Tags to obtain from content.
+ *
 * @author Braydon
- */public enum ContentTag {
+ */
+public enum ContentTag {
+    // VULGARITY is the tag reported by the vulgarity text processor.
+    // NOTE(review): nothing visible in this change reports ADVERTISEMENT yet.
+    VULGARITY, ADVERTISEMENT
 }
--- a/API/src/main/java/me/braydon/profanity/model/ProfanityList.java
+++ b/API/src/main/java/me/braydon/profanity/model/ProfanityList.java
@ -4,6 +4,7 @@ import lombok.AllArgsConstructor;
 import lombok.Getter;
 import lombok.NonNull;
 import me.braydon.profanity.common.Language;
+import org.springframework.data.annotation.Id;

 import java.util.List;
 import java.util.Map;
@ -16,6 +17,8 @@ import java.util.Map;
 */
@AllArgsConstructor @Getter
 public final class ProfanityList {
+    @Id @NonNull private final String id;
+
    /**
     * The links that are whitelisted from the filter.
     */
--- a/API/src/main/java/me/braydon/profanity/model/input/ContentProcessInput.java
+++ b/API/src/main/java/me/braydon/profanity/model/input/ContentProcessInput.java
@ -15,6 +15,12 @@ public final class ContentProcessInput {
     */
    private String content;

+    /**
+     * The char to use for matched
+     * replacement operations.
+     */
+    private char replaceChar = '*';
+
    /**
     * Check if this input is malformed.
     *
--- a/API/src/main/java/me/braydon/profanity/model/response/ContentProcessResponse.java
+++ b/API/src/main/java/me/braydon/profanity/model/response/ContentProcessResponse.java
@ -3,6 +3,7 @@ package me.braydon.profanity.model.response;
 import lombok.AllArgsConstructor;
 import lombok.Getter;
 import lombok.NonNull;
+import me.braydon.profanity.common.ContentTag;

 import java.util.List;

@ -24,7 +25,7 @@ public final class ContentProcessResponse {
    /**
     * The tags obtained from the content.
     */
-    @NonNull private final List<String> tags;
+    @NonNull private final List<ContentTag> tags;

    /**
     * The score of the content.
--- a/API/src/main/java/me/braydon/profanity/processor/TextProcessor.java
+++ b/API/src/main/java/me/braydon/profanity/processor/TextProcessor.java
@ -1,4 +1,34 @@
-package me.braydon.profanity.processor;/**
+package me.braydon.profanity.processor;
+
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import lombok.NonNull;
+import me.braydon.profanity.common.ContentTag;
+import me.braydon.profanity.model.ProfanityList;
+
+import java.util.List;
+
+/**
 * @author Braydon
- */public final class TextProcessor {
+ */
+@AllArgsConstructor @Getter
+public abstract class TextProcessor {
+    /**
+     * The tag that should be applied to content
+     * when this processor matches something in it.
+     */
+    @NonNull private final ContentTag tag;
+
+    /**
+     * Process the given content.
+     *
+     * @param profanityList the profanity list to use
+     * @param content the content to process
+     * @param replacement the replacement content to modify
+     * @param replaceChar the replace char to use
+     * @param matched the matched content to add to
+     * @return the replaced content
+     */
+    @NonNull public abstract StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content,
+                                            @NonNull StringBuilder replacement, int replaceChar, @NonNull List<String> matched);
 }
--- a/API/src/main/java/me/braydon/profanity/processor/impl/VulgarityProcessor.java
+++ b/API/src/main/java/me/braydon/profanity/processor/impl/VulgarityProcessor.java
@ -1,4 +1,129 @@
-package me.braydon.profanity.processor.impl;/**
+package me.braydon.profanity.processor.impl;
+
+import lombok.NonNull;
+import me.braydon.profanity.common.ContentTag;
+import me.braydon.profanity.common.Language;
+import me.braydon.profanity.model.ProfanityList;
+import me.braydon.profanity.processor.TextProcessor;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A text processor to filter vulgar content.
+ *
 * @author Braydon
- */public final class VulgarityProcessor {
+ */
+public final class VulgarityProcessor extends TextProcessor {
+    /**
+     * Compiled patterns for profane words, keyed by the word they detect.
+     * Filled on the first call to process, from the list passed to that call.
+     */
+    private static final Map<String, Pattern> wordPatterns = Collections.synchronizedMap(new HashMap<>());
+
+    /**
+     * Substitutions for characters in profane words. Each key is a digit
+     * or symbol, and each value is the letter that it stands in for.
+     */
+    private static final Map<Character, Character> charSubstitutions = Collections.synchronizedMap(new HashMap<>());
+    static { // Populate char substitutions
+        charSubstitutions.put('3', 'e');
+        charSubstitutions.put('1', 'i');
+        charSubstitutions.put('!', 'i');
+        charSubstitutions.put('@', 'a');
+        charSubstitutions.put('7', 't');
+        charSubstitutions.put('0', 'o');
+        charSubstitutions.put('5', 's');
+        charSubstitutions.put('8', 'b');
+        charSubstitutions.put('$', 's');
+    }
+
+    /**
+     * Create a processor whose tag is {@link ContentTag#VULGARITY}.
+     */
+    public VulgarityProcessor() {
+        super(ContentTag.VULGARITY);
+    }
+
+    /**
+     * Censor every profane word found in the replacement text.
+     * <p>
+     * Matching runs on a lowercased, punctuation-free view that is built
+     * from {@code replacement} itself, and every char of that view remembers
+     * the index it came from. Matches are therefore always censored at the
+     * right position, which is not possible with indexes taken from a
+     * separately normalized string.
+     * </p>
+     *
+     * @param profanityList the profanity list to use
+     * @param content       the pre-normalized content; not used for matching, as
+     *                      its indexes are not guaranteed to line up with the replacement
+     * @param replacement   the replacement content to modify
+     * @param replaceChar   the replace char (code point) to use
+     * @param matched       the matched content to add to
+     * @return the replaced content, which is the given replacement instance
+     */
+    @Override @NonNull
+    public StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content,
+                                 @NonNull StringBuilder replacement, int replaceChar, @NonNull List<String> matched) {
+        // Populate word patterns if empty
+        if (wordPatterns.isEmpty()) {
+            populatePatterns(profanityList);
+        }
+
+        // Build the view to match against, and remember which
+        // index of the replacement each kept char came from
+        int length = replacement.length();
+        int[] sourceIndex = new int[length];
+        StringBuilder view = new StringBuilder(length);
+        for (int i = 0; i < length; i++) {
+            char character = replacement.charAt(i);
+            // The same ASCII characters that \p{Punct} matches
+            boolean punctuation = (character >= '!' && character <= '/') || (character >= ':' && character <= '@')
+                    || (character >= '[' && character <= '`') || (character >= '{' && character <= '~');
+            if (!punctuation) {
+                sourceIndex[view.length()] = i;
+                view.append(Character.toLowerCase(character));
+            }
+        }
+
+        // Process single words in the content
+        boolean[] censored = new boolean[length];
+        for (Map.Entry<String, Pattern> entry : wordPatterns.entrySet()) {
+            Matcher matcher = entry.getValue().matcher(view);
+            while (matcher.find()) {
+                matched.add(entry.getKey());
+                if (matcher.end() == matcher.start()) { // An empty match has nothing to censor
+                    continue;
+                }
+                // Censor the whole span, including any punctuation inside of it
+                int last = sourceIndex[matcher.end() - 1];
+                for (int i = sourceIndex[matcher.start()]; i <= last; i++) {
+                    censored[i] = true;
+                }
+            }
+        }
+
+        // Apply from right to left so the remaining indexes stay
+        // valid even if the replace char is wider than one char
+        String mask = Character.toString(replaceChar);
+        for (int i = length - 1; i >= 0; i--) {
+            if (censored[i]) {
+                replacement.replace(i, i + 1, mask);
+            }
+        }
+
+        // TODO: Process phrases in the content
+
+        return replacement;
+    }
+
+    /**
+     * Compile a pattern for every profane word in the
+     * given profanity list and cache it under that word.
+     *
+     * @param profanityList the profanity list to use
+     */
+    private void populatePatterns(@NonNull ProfanityList profanityList) {
+        profanityList.getProfaneWords().values().forEach(words -> {
+            for (String profaneWord : words) {
+                Pattern compiled = Pattern.compile(buildCombinedRegex(profaneWord), Pattern.CASE_INSENSITIVE);
+                wordPatterns.put(profaneWord, compiled);
+            }
+        });
+    }
+
+    /**
+     * Build a regex pattern for the given word.
+     * <p>
+     * This pattern will match the exact, and obfuscated versions
+     * of the word. Every char of the word becomes either a literal
+     * or a character class holding the char together with each char
+     * it can be swapped with according to the char substitutions, in
+     * either direction. Each position still matches exactly one char,
+     * so a match is always as long as the word.
+     * </p>
+     *
+     * @param word the word to build for
+     * @return the built regex pattern
+     */
+    @NonNull
+    private String buildCombinedRegex(@NonNull String word) {
+        StringBuilder regex = new StringBuilder();
+
+        for (char character : word.toCharArray()) {
+            char lowerChar = Character.toLowerCase(character);
+
+            // Collect the char itself and everything it can be swapped with
+            StringBuilder alternatives = new StringBuilder().append(lowerChar);
+            Character letter = charSubstitutions.get(lowerChar);
+            if (letter != null) { // The word itself is written with a substitute
+                alternatives.append(letter.charValue());
+            }
+            for (Map.Entry<Character, Character> substitution : charSubstitutions.entrySet()) {
+                if (substitution.getValue() == lowerChar) { // A substitute for this letter
+                    alternatives.append(substitution.getKey().charValue());
+                }
+            }
+
+            // Append the alternatives, as a character class if there is more than one
+            boolean characterClass = alternatives.length() > 1;
+            if (characterClass) {
+                regex.append('[');
+            }
+            for (int i = 0; i < alternatives.length(); i++) {
+                char alternative = alternatives.charAt(i);
+                // A backslash is only valid in front of non-alphabetic chars, and a
+                // digit after one is a back reference, so only escape everything else
+                if (!Character.isLetterOrDigit(alternative) && !Character.isSurrogate(alternative)) {
+                    regex.append('\\');
+                }
+                regex.append(alternative);
+            }
+            if (characterClass) {
+                regex.append(']');
+            }
+        }
+
+        return regex.toString();
+    }
 }
--- a/API/src/main/java/me/braydon/profanity/service/FiltrationService.java
+++ b/API/src/main/java/me/braydon/profanity/service/FiltrationService.java
@ -1,13 +1,27 @@
 package me.braydon.profanity.service;

+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import jakarta.annotation.PostConstruct;
 import lombok.NonNull;
+import lombok.SneakyThrows;
+import lombok.extern.log4j.Log4j2;
+import me.braydon.profanity.TextPurifyAPI;
+import me.braydon.profanity.common.ContentTag;
+import me.braydon.profanity.common.Language;
+import me.braydon.profanity.model.ProfanityList;
 import me.braydon.profanity.model.input.ContentProcessInput;
 import me.braydon.profanity.model.response.ContentProcessResponse;
+import me.braydon.profanity.processor.TextProcessor;
+import me.braydon.profanity.processor.impl.VulgarityProcessor;
 import me.braydon.profanity.repository.ProfanityListRepository;
+import org.apache.commons.text.StringEscapeUtils;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Service;

-import java.util.ArrayList;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.*;

 /**
 * This service is responsible
@ -15,17 +29,89 @@ import java.util.ArrayList;
 *
 * @author Braydon
 */
-@Service
+@Service @Log4j2(topic = "Filtration Service")
 public final class FiltrationService {
+    /**
+     * The profanity list repository to use.
+     */
    @NonNull private final ProfanityListRepository profanityListRepository;

+    /**
+     * The profanity list to use.
+     */
+    private ProfanityList profanityList;
+
+    /**
+     * The registered text processors to use.
+     */
+    @NonNull private final List<TextProcessor> textProcessors = Collections.synchronizedList(new ArrayList<>());
+
    @Autowired
    public FiltrationService(@NonNull ProfanityListRepository profanityListRepository) {
        this.profanityListRepository = profanityListRepository;
+
+        // Register the text processors; content is run
+        // through them in the order they are added here
+        textProcessors.add(new VulgarityProcessor());
+    }
+
+    /**
+     * Populate the database with
+     * default lists if empty.
+     * <p>
+     * If the repository already has a list, it is used as is.
+     * Otherwise one JSON array is downloaded per language, its
+     * entries are split into words and phrases (entries with a
+     * space are phrases), and the result is saved. Any failure
+     * while downloading is rethrown as is by SneakyThrows.
+     * </p>
+     */
+    @PostConstruct @SneakyThrows
+    public void populateDefaults() {
+        long before = System.currentTimeMillis();
+
+        // List is already present
+        if ((profanityList = profanityListRepository.getProfanityList()) != null) {
+            log.info("Loaded lists in {}ms", System.currentTimeMillis() - before);
+            return;
+        }
+        // Download the pre-made lists
+        // for each language and save it.
+        log.info("Downloading pre-made lists...");
+        before = System.currentTimeMillis();
+
+        Map<Language, List<String>> profaneWords = new HashMap<>();
+        Map<Language, List<String>> profanePhrases = new HashMap<>();
+        for (Language lang : Language.values()) {
+            // Locale.ROOT keeps the file name stable regardless of the default locale
+            String contentUrl = "https://raw.githubusercontent.com/Rainnny7/TextPurify/master/lists/" + lang.name().toLowerCase(Locale.ROOT) + ".json";
+            JsonArray content;
+            // Closing the scanner also closes the underlying connection stream
+            try (Scanner scanner = new Scanner(new URL(contentUrl).openStream(), StandardCharsets.UTF_8)) {
+                content = TextPurifyAPI.GSON.fromJson(scanner.useDelimiter("\\A").next(), JsonArray.class);
+            }
+            for (JsonElement item : content) {
+                String element = item.getAsString();
+                (element.contains(" ") ? profanePhrases : profaneWords).computeIfAbsent(lang, ignored -> new ArrayList<>()).add(element);
+            }
+        }
+        profanityList = profanityListRepository.save(new ProfanityList("primary", new ArrayList<>(), profaneWords, profanePhrases));
+        log.info("Downloaded lists in {}ms", System.currentTimeMillis() - before);
    }

    @NonNull
    public ContentProcessResponse process(@NonNull ContentProcessInput input) {
-        return new ContentProcessResponse(input.getContent(), new ArrayList<>(), new ArrayList<>(), 0D);
+        List<String> matched = new ArrayList<>(); // The content that was matched
+        List<ContentTag> tags = new ArrayList<>(); // Tags obtained from the processed content
+        StringBuilder replacement = new StringBuilder(input.getContent());
+
+        // Handle filtering if a profanity list is present
+        if (profanityList != null) {
+            String content = StringEscapeUtils.escapeJava(input.getContent()).toLowerCase().trim(); // The content to filter
+
+            // Invoke each text processor on the content
+            for (TextProcessor textProcessor : textProcessors) {
+                int before = matched.size();
+                replacement = textProcessor.process(profanityList, content, replacement, input.getReplaceChar(), matched);
+                if (matched.size() > before) {
+                    tags.add(textProcessor.getTag());
+                }
+            }
+        }
+
+        // Calculate the score based on
+        // the matched profane content
+        double score = 0D;
+
+        return new ContentProcessResponse(replacement.toString(), matched, tags, score);
    }
 }
--- a/API/src/main/resources/application.yml
+++ b/API/src/main/resources/application.yml
@ -10,6 +10,13 @@ logging:

 # Spring Configuration
 spring:
+  data:
+    # MongoDB Configuration
+    mongodb:
+      uri: "mongodb://textpurify:p4$$w0rd@localhost:27017"
+      database: "textpurify"
+      auto-index-creation: true # Automatically create collection indexes
+
  # Don't serialize null values by default with Jackson
  jackson:
    default-property-inclusion: non_null