From 6db88e893213dfe0d703c34d3862aa6e5d68433b Mon Sep 17 00:00:00 2001 From: Rainnny7 Date: Thu, 6 Jun 2024 21:18:06 -0400 Subject: [PATCH] Basic functionality --- API/pom.xml | 18 +++ .../me/braydon/profanity/TextPurifyAPI.java | 3 + .../braydon/profanity/common/ContentTag.java | 10 +- .../profanity/model/ProfanityList.java | 3 + .../model/input/ContentProcessInput.java | 6 + .../response/ContentProcessResponse.java | 3 +- .../profanity/processor/TextProcessor.java | 34 ++++- .../processor/impl/VulgarityProcessor.java | 129 +++++++++++++++++- .../profanity/service/FiltrationService.java | 92 ++++++++++++- API/src/main/resources/application.yml | 7 + 10 files changed, 295 insertions(+), 10 deletions(-) diff --git a/API/pom.xml b/API/pom.xml index 76ae134..2ffe83c 100644 --- a/API/pom.xml +++ b/API/pom.xml @@ -49,6 +49,12 @@ spring-boot-starter-web + + + org.springframework.boot + spring-boot-starter-data-mongodb + + org.projectlombok @@ -56,5 +62,17 @@ 1.18.32 provided + + com.google.code.gson + gson + 2.11.0 + compile + + + org.apache.commons + commons-text + 1.12.0 + compile + \ No newline at end of file diff --git a/API/src/main/java/me/braydon/profanity/TextPurifyAPI.java b/API/src/main/java/me/braydon/profanity/TextPurifyAPI.java index c20ed3d..c6fa7d7 100644 --- a/API/src/main/java/me/braydon/profanity/TextPurifyAPI.java +++ b/API/src/main/java/me/braydon/profanity/TextPurifyAPI.java @@ -1,5 +1,6 @@ package me.braydon.profanity; +import com.google.gson.Gson; import lombok.NonNull; import lombok.SneakyThrows; import lombok.extern.log4j.Log4j2; @@ -17,6 +18,8 @@ import java.util.Objects; @SpringBootApplication @Log4j2(topic = "TextPurify") public class TextPurifyAPI { + public static final Gson GSON = new Gson(); + @SneakyThrows public static void main(@NonNull String[] args) { // Handle loading of our configuration file diff --git a/API/src/main/java/me/braydon/profanity/common/ContentTag.java b/API/src/main/java/me/braydon/profanity/common/ContentTag.java 
index a08c378..f025218 100644 --- a/API/src/main/java/me/braydon/profanity/common/ContentTag.java +++ b/API/src/main/java/me/braydon/profanity/common/ContentTag.java @@ -1,4 +1,10 @@ -package me.braydon.profanity.common;/** +package me.braydon.profanity.common; + +/** + * Tags to obtain from content. + * * @author Braydon - */public enum ContentTag { + */ +public enum ContentTag { + VULGARITY, ADVERTISEMENT } \ No newline at end of file diff --git a/API/src/main/java/me/braydon/profanity/model/ProfanityList.java b/API/src/main/java/me/braydon/profanity/model/ProfanityList.java index cb6bc3a..8edbc63 100644 --- a/API/src/main/java/me/braydon/profanity/model/ProfanityList.java +++ b/API/src/main/java/me/braydon/profanity/model/ProfanityList.java @@ -4,6 +4,7 @@ import lombok.AllArgsConstructor; import lombok.Getter; import lombok.NonNull; import me.braydon.profanity.common.Language; +import org.springframework.data.annotation.Id; import java.util.List; import java.util.Map; @@ -16,6 +17,8 @@ import java.util.Map; */ @AllArgsConstructor @Getter public final class ProfanityList { + @Id @NonNull private final String id; + /** * The links that are whitelisted from the filter. */ diff --git a/API/src/main/java/me/braydon/profanity/model/input/ContentProcessInput.java b/API/src/main/java/me/braydon/profanity/model/input/ContentProcessInput.java index 437e66a..9892fff 100644 --- a/API/src/main/java/me/braydon/profanity/model/input/ContentProcessInput.java +++ b/API/src/main/java/me/braydon/profanity/model/input/ContentProcessInput.java @@ -15,6 +15,12 @@ public final class ContentProcessInput { */ private String content; + /** + * The char to use for matched + * replacement operations. + */ + private char replaceChar = '*'; + /** * Check if this input is malformed. 
* diff --git a/API/src/main/java/me/braydon/profanity/model/response/ContentProcessResponse.java b/API/src/main/java/me/braydon/profanity/model/response/ContentProcessResponse.java index a58e0a4..4ecd94a 100644 --- a/API/src/main/java/me/braydon/profanity/model/response/ContentProcessResponse.java +++ b/API/src/main/java/me/braydon/profanity/model/response/ContentProcessResponse.java @@ -3,6 +3,7 @@ package me.braydon.profanity.model.response; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.NonNull; +import me.braydon.profanity.common.ContentTag; import java.util.List; @@ -24,7 +25,7 @@ public final class ContentProcessResponse { /** * The tags obtained from the content. */ - @NonNull private final List tags; + @NonNull private final List tags; /** * The score of the content. diff --git a/API/src/main/java/me/braydon/profanity/processor/TextProcessor.java b/API/src/main/java/me/braydon/profanity/processor/TextProcessor.java index a93f183..d065301 100644 --- a/API/src/main/java/me/braydon/profanity/processor/TextProcessor.java +++ b/API/src/main/java/me/braydon/profanity/processor/TextProcessor.java @@ -1,4 +1,34 @@ -package me.braydon.profanity.processor;/** +package me.braydon.profanity.processor; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NonNull; +import me.braydon.profanity.common.ContentTag; +import me.braydon.profanity.model.ProfanityList; + +import java.util.List; + +/** * @author Braydon - */public final class TextProcessor { + */ +@AllArgsConstructor @Getter +public abstract class TextProcessor { + /** + * The tag that should be applied to content + * if they are processed by this processor. + */ + @NonNull private final ContentTag tag; + + /** + * Process the given content.
+ * + * @param profanityList the profanity list to use + * @param content the content to process + * @param replacement the replacement content to modify + * @param replaceChar the replace char to use + * @param matched the matched content to add to + * @return the replaced content + */ + @NonNull public abstract StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content, + @NonNull StringBuilder replacement, int replaceChar, @NonNull List matched); } \ No newline at end of file diff --git a/API/src/main/java/me/braydon/profanity/processor/impl/VulgarityProcessor.java b/API/src/main/java/me/braydon/profanity/processor/impl/VulgarityProcessor.java index 136acd1..fe4319a 100644 --- a/API/src/main/java/me/braydon/profanity/processor/impl/VulgarityProcessor.java +++ b/API/src/main/java/me/braydon/profanity/processor/impl/VulgarityProcessor.java @@ -1,4 +1,129 @@ -package me.braydon.profanity.processor.impl;/** +package me.braydon.profanity.processor.impl; + +import lombok.NonNull; +import me.braydon.profanity.common.ContentTag; +import me.braydon.profanity.common.Language; +import me.braydon.profanity.model.ProfanityList; +import me.braydon.profanity.processor.TextProcessor; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * A text processor to filter vulgar content. + * * @author Braydon - */public final class VulgarityProcessor { + */ +public final class VulgarityProcessor extends TextProcessor { + /** + * Patterns for profane words. + */ + private static final Map wordPatterns = Collections.synchronizedMap(new HashMap<>()); + + /** + * Substitutions for characters in profane words. 
+ */ + private static final Map charSubstitutions = Collections.synchronizedMap(new HashMap<>()); + static { // Populate char substitutions + charSubstitutions.put('3', 'e'); + charSubstitutions.put('1', 'i'); + charSubstitutions.put('!', 'i'); + charSubstitutions.put('@', 'a'); + charSubstitutions.put('7', 't'); + charSubstitutions.put('0', 'o'); + charSubstitutions.put('5', 's'); + charSubstitutions.put('8', 'b'); + charSubstitutions.put('$', 's'); + } + + public VulgarityProcessor() { + super(ContentTag.VULGARITY); + } + + /** + * Process the given content. + * + * @param profanityList the profanity list to use + * @param content the content to process + * @param replacement the replacement content to modify + * @param replaceChar the replace char to use + * @param matched the matched content to add to + * @return the replaced content + */ + @Override @NonNull + public StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content, + @NonNull StringBuilder replacement, int replaceChar, @NonNull List matched) { + // Populate word patterns if empty + if (wordPatterns.isEmpty()) { + populatePatterns(profanityList); + } + content = content.replaceAll("\\p{Punct}", ""); // Replace punctuation + + // Process single words in the content + int offset = 0; + for (Map.Entry entry : wordPatterns.entrySet()) { + String word = entry.getKey(); + Pattern pattern = entry.getValue(); + Matcher matcher = pattern.matcher(content); + + while (matcher.find()) { + matched.add(word); + int start = offset + matcher.start(); + int end = offset + matcher.end(); + replacement.replace(start, end, Character.toString(replaceChar).repeat(word.length())); + offset += word.length() - (end - start); + } + } + + // TODO: Process phrases in the content + + return replacement; + } + + /** + * Populate the word patterns + * for the given profanity list.
+ * + * @param profanityList the profanity list to use + */ + private void populatePatterns(@NonNull ProfanityList profanityList) { + for (Map.Entry> entry : profanityList.getProfaneWords().entrySet()) { + for (String word : entry.getValue()) { + wordPatterns.put(word, Pattern.compile(buildCombinedRegex(word), Pattern.CASE_INSENSITIVE)); + } + } + } + + /** + * Build a regex pattern for the given word. + *

+ * This pattern will match both the exact and + * obfuscated versions of the word. + *

+ * + * @param word the word to build for + * @return the built regex pattern + */ + @NonNull + private String buildCombinedRegex(@NonNull String word) { + StringBuilder exactWordRegex = new StringBuilder(); + StringBuilder obfuscatedWordRegex = new StringBuilder(); + + for (char character : word.toCharArray()) { + char lowerChar = Character.toLowerCase(character); + exactWordRegex.append(lowerChar); + if (charSubstitutions.containsKey(lowerChar)) { + obfuscatedWordRegex.append('[').append(lowerChar).append(charSubstitutions.get(lowerChar)).append(']'); + } else { + obfuscatedWordRegex.append(lowerChar); + } + } + + // Build the pattern + return exactWordRegex + ((exactWordRegex.compareTo(obfuscatedWordRegex) == 0) ? "" : "|" + obfuscatedWordRegex); + } } \ No newline at end of file diff --git a/API/src/main/java/me/braydon/profanity/service/FiltrationService.java b/API/src/main/java/me/braydon/profanity/service/FiltrationService.java index b4a11fd..6e55826 100644 --- a/API/src/main/java/me/braydon/profanity/service/FiltrationService.java +++ b/API/src/main/java/me/braydon/profanity/service/FiltrationService.java @@ -1,13 +1,27 @@ package me.braydon.profanity.service; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import jakarta.annotation.PostConstruct; import lombok.NonNull; +import lombok.SneakyThrows; +import lombok.extern.log4j.Log4j2; +import me.braydon.profanity.TextPurifyAPI; +import me.braydon.profanity.common.ContentTag; +import me.braydon.profanity.common.Language; +import me.braydon.profanity.model.ProfanityList; import me.braydon.profanity.model.input.ContentProcessInput; import me.braydon.profanity.model.response.ContentProcessResponse; +import me.braydon.profanity.processor.TextProcessor; +import me.braydon.profanity.processor.impl.VulgarityProcessor; import me.braydon.profanity.repository.ProfanityListRepository; +import org.apache.commons.text.StringEscapeUtils; import 
org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; -import java.util.ArrayList; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.*; /** * This service is responsible @@ -15,17 +29,89 @@ import java.util.ArrayList; * * @author Braydon */ -@Service +@Service @Log4j2(topic = "Filtration Service") public final class FiltrationService { + /** + * The profanity list repository to use. + */ @NonNull private final ProfanityListRepository profanityListRepository; + /** + * The profanity list to use. + */ + private ProfanityList profanityList; + + /** + * The registered text processors to use. + */ + @NonNull private final List textProcessors = Collections.synchronizedList(new ArrayList<>()); + @Autowired public FiltrationService(@NonNull ProfanityListRepository profanityListRepository) { this.profanityListRepository = profanityListRepository; + + // Register text processors + textProcessors.add(new VulgarityProcessor()); + } + + /** + * Populate the database with + * default lists if empty. + */ + @PostConstruct @SneakyThrows + public void populateDefaults() { + long before = System.currentTimeMillis(); + + // List is already present + if ((profanityList = profanityListRepository.getProfanityList()) != null) { + log.info("Loaded lists in {}ms", System.currentTimeMillis() - before); + return; + } + // Download the pre-made lists + // for each language and save it. 
+ log.info("Downloading pre-made lists..."); + before = System.currentTimeMillis(); + + Map> profaneWords = new HashMap<>(); + Map> profanePhrases = new HashMap<>(); + for (Language lang : Language.values()) { + String contentUrl = "https://raw.githubusercontent.com/Rainnny7/TextPurify/master/lists/" + lang.name().toLowerCase() + ".json"; + JsonArray content = TextPurifyAPI.GSON.fromJson(new Scanner(new URL(contentUrl).openStream(), + StandardCharsets.UTF_8 + ).useDelimiter("\\A").next(), JsonArray.class); + for (JsonElement item : content) { + String element = item.getAsString(); + (element.contains(" ") ? profanePhrases : profaneWords).computeIfAbsent(lang, $ -> new ArrayList<>()).add(element); + } + } + profanityList = profanityListRepository.save(new ProfanityList("primary", new ArrayList<>(), profaneWords, profanePhrases)); + log.info("Downloaded lists in {}ms", System.currentTimeMillis() - before); } @NonNull public ContentProcessResponse process(@NonNull ContentProcessInput input) { - return new ContentProcessResponse(input.getContent(), new ArrayList<>(), new ArrayList<>(), 0D); + List matched = new ArrayList<>(); // The content that was matched + List tags = new ArrayList<>(); // Tags obtained from the processed content + StringBuilder replacement = new StringBuilder(input.getContent()); + + // Handle filtering if a profanity list is present + if (profanityList != null) { + String content = StringEscapeUtils.escapeJava(input.getContent()).toLowerCase().trim(); // The content to filter + + // Invoke each text processor on the content + for (TextProcessor textProcessor : textProcessors) { + int before = matched.size(); + replacement = textProcessor.process(profanityList, content, replacement, input.getReplaceChar(), matched); + if (matched.size() > before) { + tags.add(textProcessor.getTag()); + } + } + } + + // Calculate the score based on + // the matched profane content + double score = 0D; + + return new ContentProcessResponse(replacement.toString(), 
matched, tags, score); } } \ No newline at end of file diff --git a/API/src/main/resources/application.yml b/API/src/main/resources/application.yml index 263ff6d..38c7d58 100644 --- a/API/src/main/resources/application.yml +++ b/API/src/main/resources/application.yml @@ -10,6 +10,13 @@ logging: # Spring Configuration spring: + data: + # MongoDB Configuration + mongodb: + uri: "mongodb://textpurify:p4$$w0rd@localhost:27017" + database: "textpurify" + auto-index-creation: true # Automatically create collection indexes + # Don't serialize null values by default with Jackson jackson: default-property-inclusion: non_null