Basic functionality

This commit is contained in:
Braydon 2024-06-06 21:18:06 -04:00
parent b003ed5837
commit 6db88e8932
10 changed files with 295 additions and 10 deletions

View File

@ -49,6 +49,12 @@
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- MongoDB -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-mongodb</artifactId>
</dependency>
<!-- Libraries -->
<dependency>
<groupId>org.projectlombok</groupId>
@ -56,5 +62,17 @@
<version>1.18.32</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.11.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.12.0</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>

View File

@ -1,5 +1,6 @@
package me.braydon.profanity;
import com.google.gson.Gson;
import lombok.NonNull;
import lombok.SneakyThrows;
import lombok.extern.log4j.Log4j2;
@ -17,6 +18,8 @@ import java.util.Objects;
@SpringBootApplication
@Log4j2(topic = "TextPurify")
public class TextPurifyAPI {
public static final Gson GSON = new Gson();
@SneakyThrows
public static void main(@NonNull String[] args) {
// Handle loading of our configuration file

View File

@ -1,4 +1,10 @@
package me.braydon.profanity.common;/**
package me.braydon.profanity.common;
/**
* Tags to obtain from content.
*
* @author Braydon
*/public enum ContentTag {
*/
public enum ContentTag {
    /**
     * Tag for vulgar content.
     */
    VULGARITY,

    /**
     * Tag for advertisement content.
     * NOTE(review): no processor applying this tag is visible in this change.
     */
    ADVERTISEMENT;
}

View File

@ -4,6 +4,7 @@ import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NonNull;
import me.braydon.profanity.common.Language;
import org.springframework.data.annotation.Id;
import java.util.List;
import java.util.Map;
@ -16,6 +17,8 @@ import java.util.Map;
*/
@AllArgsConstructor @Getter
public final class ProfanityList {
@Id @NonNull private final String id;
/**
* The links that are whitelisted from the filter.
*/

View File

@ -15,6 +15,12 @@ public final class ContentProcessInput {
*/
private String content;
/**
* The char to use for matched
* replacement operations.
*/
private char replaceChar = '*';
/**
* Check if this input is malformed.
*

View File

@ -3,6 +3,7 @@ package me.braydon.profanity.model.response;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NonNull;
import me.braydon.profanity.common.ContentTag;
import java.util.List;
@ -24,7 +25,7 @@ public final class ContentProcessResponse {
/**
* The tags obtained from the content.
*/
@NonNull private final List<String> tags;
@NonNull private final List<ContentTag> tags;
/**
* The score of the content.

View File

@ -1,4 +1,34 @@
package me.braydon.profanity.processor;/**
package me.braydon.profanity.processor;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NonNull;
import me.braydon.profanity.common.ContentTag;
import me.braydon.profanity.model.ProfanityList;
import java.util.List;
/**
* @author Braydon
*/public final class TextProcessor {
*/
// Base type for content processors: each processor carries the tag it
// contributes and implements process(...) to inspect and censor content.
@AllArgsConstructor @Getter
public abstract class TextProcessor {
/**
* The tag that should be applied to content
* if it is processed by this processor.
*/
@NonNull private final ContentTag tag;
/**
* Process the given content.
*
* @param profanityList the profanity list to use
* @param content the content to process
* @param replacement the replacement content to modify
* @param replaceChar the char to write over matched content; declared as
*                    an int (presumably a code point - confirm with callers)
* @param matched the list that matched content is added to
* @return the replaced content
*/
@NonNull public abstract StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content,
@NonNull StringBuilder replacement, int replaceChar, @NonNull List<String> matched);
}

View File

@ -1,4 +1,129 @@
package me.braydon.profanity.processor.impl;/**
package me.braydon.profanity.processor.impl;
import lombok.NonNull;
import me.braydon.profanity.common.ContentTag;
import me.braydon.profanity.common.Language;
import me.braydon.profanity.model.ProfanityList;
import me.braydon.profanity.processor.TextProcessor;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* A text processor to filter vulgar content.
*
* @author Braydon
*/public final class VulgarityProcessor {
*/
public final class VulgarityProcessor extends TextProcessor {
    /**
     * The ASCII punctuation characters, i.e. the same
     * set the regex class {@code \p{Punct}} matches.
     */
    private static final String PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";

    /**
     * Patterns for profane words.
     */
    private static final Map<String, Pattern> wordPatterns = Collections.synchronizedMap(new HashMap<>());

    /**
     * Substitutions for characters in profane words,
     * keyed by the obfuscating symbol (e.g. '3' stands for 'e').
     */
    private static final Map<Character, Character> charSubstitutions = Collections.synchronizedMap(new HashMap<>());

    /**
     * The inverse of {@link #charSubstitutions}: for each letter, the
     * regex-escaped symbols that may stand in for it. Only written
     * by the static initializer, read-only afterwards.
     */
    private static final Map<Character, String> symbolsByLetter = new HashMap<>();

    static { // Populate char substitutions
        charSubstitutions.put('3', 'e');
        charSubstitutions.put('1', 'i');
        charSubstitutions.put('!', 'i');
        charSubstitutions.put('@', 'a');
        charSubstitutions.put('7', 't');
        charSubstitutions.put('0', 'o');
        charSubstitutions.put('5', 's');
        charSubstitutions.put('8', 'b');
        charSubstitutions.put('$', 's');

        // Words are spelled with letters, so the regex builder
        // needs the letter -> symbols direction of the table.
        for (Map.Entry<Character, Character> substitution : charSubstitutions.entrySet()) {
            symbolsByLetter.merge(substitution.getValue(), escapeLiteral(substitution.getKey()), String::concat);
        }
    }

    public VulgarityProcessor() {
        super(ContentTag.VULGARITY);
    }

    /**
     * Process the given content.
     * <p>
     * Matching runs on a copy of the content with separator punctuation
     * removed (so "d.a.m.n" is still found), but every match is mapped
     * back to its position in the original content before the replacement
     * is censored. Punctuation that doubles as an obfuscation symbol
     * ('!', '@', '$') is kept so that it can take part in a match.
     * </p>
     *
     * @param profanityList the profanity list to use
     * @param content the content to process; expected to line up
     *                character by character with the replacement
     * @param replacement the replacement content to modify
     * @param replaceChar the replace char to use
     * @param matched the matched content to add to
     * @return the replaced content
     */
    @Override @NonNull
    public StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content,
                                 @NonNull StringBuilder replacement, int replaceChar, @NonNull List<String> matched) {
        // Populate word patterns if empty, then take a snapshot. Views of a
        // synchronized map must only be traversed while holding its lock.
        List<Map.Entry<String, Pattern>> patterns;
        synchronized (wordPatterns) {
            if (wordPatterns.isEmpty()) {
                populatePatterns(profanityList);
            }
            patterns = List.copyOf(wordPatterns.entrySet());
        }

        // Strip separator punctuation, remembering where each kept char came from
        int length = content.length();
        StringBuilder searchable = new StringBuilder(length);
        int[] origin = new int[length];
        for (int index = 0; index < length; index++) {
            char character = content.charAt(index);
            if (PUNCTUATION.indexOf(character) >= 0 && !charSubstitutions.containsKey(character)) {
                continue;
            }
            origin[searchable.length()] = index;
            searchable.append(character);
        }

        // Process single words in the content
        boolean[] censored = new boolean[length];
        for (Map.Entry<String, Pattern> entry : patterns) {
            Matcher matcher = entry.getValue().matcher(searchable);
            while (matcher.find()) {
                if (matcher.end() == matcher.start()) { // Nothing to censor
                    continue;
                }
                matched.add(entry.getKey());

                // Translate the match back to positions in the original content
                int start = origin[matcher.start()];
                int end = origin[matcher.end() - 1] + 1;
                for (int index = start; index < end; index++) {
                    censored[index] = true;
                }
            }
        }

        // Censor from right to left so earlier indexes remain
        // valid even if the mask is wider than a single char
        String mask = Character.toString(replaceChar);
        for (int index = Math.min(length, replacement.length()) - 1; index >= 0; index--) {
            if (censored[index]) {
                replacement.replace(index, index + 1, mask);
            }
        }

        // TODO: Process phrases in the content
        return replacement;
    }

    /**
     * Populate the word patterns
     * for the given profanity list.
     *
     * @param profanityList the profanity list to use
     */
    private void populatePatterns(@NonNull ProfanityList profanityList) {
        for (Map.Entry<Language, List<String>> entry : profanityList.getProfaneWords().entrySet()) {
            for (String word : entry.getValue()) {
                if (word == null || word.isEmpty()) { // Would compile to a pattern that matches everywhere
                    continue;
                }
                wordPatterns.put(word, Pattern.compile(buildCombinedRegex(word), Pattern.CASE_INSENSITIVE));
            }
        }
    }

    /**
     * Build a regex pattern for the given word.
     * <p>
     * This pattern will match the exact, and
     * obfuscated versions of the word: every letter
     * that has known stand-in symbols becomes a character
     * class of the letter and those symbols.
     * </p>
     *
     * @param word the word to build for
     * @return the built regex pattern
     */
    @NonNull
    private String buildCombinedRegex(@NonNull String word) {
        StringBuilder regex = new StringBuilder();
        for (char character : word.toCharArray()) {
            char lowerChar = Character.toLowerCase(character);

            // A symbol used inside a listed word is treated as the letter it stands for
            char letter = charSubstitutions.getOrDefault(lowerChar, lowerChar);
            String symbols = symbolsByLetter.get(letter);
            if (symbols == null) {
                regex.append(escapeLiteral(lowerChar));
            } else {
                regex.append('[').append(letter).append(symbols).append(']');
            }
        }
        return regex.toString();
    }

    /**
     * Escape the given character so that it
     * is matched literally inside a regex.
     *
     * @param character the character to escape
     * @return the regex fragment for the character
     */
    @NonNull
    private static String escapeLiteral(char character) {
        // A backslash is always legal in front of a non-alphabetic character,
        // and only ASCII characters can be regex metacharacters.
        boolean needsEscape = character < 128 && !Character.isLetterOrDigit(character);
        return needsEscape ? "\\" + character : String.valueOf(character);
    }
}

View File

@ -1,13 +1,27 @@
package me.braydon.profanity.service;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import jakarta.annotation.PostConstruct;
import lombok.NonNull;
import lombok.SneakyThrows;
import lombok.extern.log4j.Log4j2;
import me.braydon.profanity.TextPurifyAPI;
import me.braydon.profanity.common.ContentTag;
import me.braydon.profanity.common.Language;
import me.braydon.profanity.model.ProfanityList;
import me.braydon.profanity.model.input.ContentProcessInput;
import me.braydon.profanity.model.response.ContentProcessResponse;
import me.braydon.profanity.processor.TextProcessor;
import me.braydon.profanity.processor.impl.VulgarityProcessor;
import me.braydon.profanity.repository.ProfanityListRepository;
import org.apache.commons.text.StringEscapeUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
* This service is responsible
@ -15,17 +29,89 @@ import java.util.ArrayList;
*
* @author Braydon
*/
@Service
@Service @Log4j2(topic = "Filtration Service")
public final class FiltrationService {
    /**
     * The profanity list repository to use.
     */
    @NonNull private final ProfanityListRepository profanityListRepository;

    /**
     * The profanity list to use.
     */
    private ProfanityList profanityList;

    /**
     * The registered text processors to use.
     */
    @NonNull private final List<TextProcessor> textProcessors = Collections.synchronizedList(new ArrayList<>());

    @Autowired
    public FiltrationService(@NonNull ProfanityListRepository profanityListRepository) {
        this.profanityListRepository = profanityListRepository;

        // Register text processors
        textProcessors.add(new VulgarityProcessor());
    }

    /**
     * Populate the database with
     * default lists if empty.
     */
    @PostConstruct @SneakyThrows
    public void populateDefaults() {
        long before = System.currentTimeMillis();

        // List is already present
        if ((profanityList = profanityListRepository.getProfanityList()) != null) {
            log.info("Loaded lists in {}ms", System.currentTimeMillis() - before);
            return;
        }

        // Download the pre-made lists
        // for each language and save it.
        log.info("Downloading pre-made lists...");
        before = System.currentTimeMillis();
        Map<Language, List<String>> profaneWords = new HashMap<>();
        Map<Language, List<String>> profanePhrases = new HashMap<>();
        for (Language lang : Language.values()) {
            // Locale.ROOT keeps the file name stable regardless of the JVM's default locale
            String contentUrl = "https://raw.githubusercontent.com/Rainnny7/TextPurify/master/lists/" + lang.name().toLowerCase(Locale.ROOT) + ".json";

            // Closing the scanner also closes the underlying connection stream
            JsonArray content;
            try (Scanner scanner = new Scanner(new URL(contentUrl).openStream(), StandardCharsets.UTF_8)) {
                scanner.useDelimiter("\\A");
                String json = scanner.hasNext() ? scanner.next() : "[]"; // An empty body has no tokens
                content = TextPurifyAPI.GSON.fromJson(json, JsonArray.class);
            }
            if (content == null) { // The body was a JSON null
                continue;
            }
            for (JsonElement item : content) {
                String element = item.getAsString();
                (element.contains(" ") ? profanePhrases : profaneWords).computeIfAbsent(lang, key -> new ArrayList<>()).add(element);
            }
        }
        profanityList = profanityListRepository.save(new ProfanityList("primary", new ArrayList<>(), profaneWords, profanePhrases));
        log.info("Downloaded lists in {}ms", System.currentTimeMillis() - before);
    }

    /**
     * Process the given input through
     * each registered text processor.
     *
     * @param input the input to process
     * @return the response holding the replaced content,
     *         the matched content, the tags and the score
     */
    @NonNull
    public ContentProcessResponse process(@NonNull ContentProcessInput input) {
        List<String> matched = new ArrayList<>(); // The content that was matched
        List<ContentTag> tags = new ArrayList<>(); // Tags obtained from the processed content
        StringBuilder replacement = new StringBuilder(input.getContent());

        // Handle filtering if a profanity list is present
        if (profanityList != null) {
            // The content to filter. It must stay the same length as the replacement,
            // since processors apply their match positions to the replacement; escaping
            // or trimming it here would shift every index.
            String content = toLowerCasePreservingLength(input.getContent());

            // Invoke each text processor on the content
            for (TextProcessor textProcessor : textProcessors) {
                int before = matched.size();
                replacement = textProcessor.process(profanityList, content, replacement, input.getReplaceChar(), matched);
                if (matched.size() > before) {
                    tags.add(textProcessor.getTag());
                }
            }
        }

        // Calculate the score based on
        // the matched profane content
        double score = 0D;

        return new ContentProcessResponse(replacement.toString(), matched, tags, score);
    }

    /**
     * Lower-case the given content one char at a time.
     * <p>
     * Unlike {@link String#toLowerCase()}, this is locale
     * independent and never changes the length of the content.
     * </p>
     *
     * @param content the content to lower-case
     * @return the lower-cased content
     */
    @NonNull
    private static String toLowerCasePreservingLength(@NonNull String content) {
        StringBuilder lowered = new StringBuilder(content.length());
        for (int index = 0; index < content.length(); index++) {
            lowered.append(Character.toLowerCase(content.charAt(index)));
        }
        return lowered.toString();
    }
}

View File

@ -10,6 +10,13 @@ logging:
# Spring Configuration
spring:
data:
# MongoDB Configuration
mongodb:
# Reserved characters in the credentials (such as '$') must be percent-encoded
uri: "mongodb://textpurify:p4%24%24w0rd@localhost:27017"
database: "textpurify"
auto-index-creation: true # Automatically create collection indexes
# Don't serialize null values by default with Jackson
jackson:
default-property-inclusion: non_null