Basic functionality
This commit is contained in:
parent
b003ed5837
commit
6db88e8932
18
API/pom.xml
18
API/pom.xml
@ -49,6 +49,12 @@
|
||||
<artifactId>spring-boot-starter-web</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- MongoDB -->
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-data-mongodb</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Libraries -->
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
@ -56,5 +62,17 @@
|
||||
<version>1.18.32</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.code.gson</groupId>
|
||||
<artifactId>gson</artifactId>
|
||||
<version>2.11.0</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-text</artifactId>
|
||||
<version>1.12.0</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
@ -1,5 +1,6 @@
|
||||
package me.braydon.profanity;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import lombok.NonNull;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
@ -17,6 +18,8 @@ import java.util.Objects;
|
||||
@SpringBootApplication
|
||||
@Log4j2(topic = "TextPurify")
|
||||
public class TextPurifyAPI {
|
||||
public static final Gson GSON = new Gson();
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(@NonNull String[] args) {
|
||||
// Handle loading of our configuration file
|
||||
|
@ -1,4 +1,10 @@
|
||||
package me.braydon.profanity.common;/**
|
||||
package me.braydon.profanity.common;
|
||||
|
||||
/**
 * The categories that a piece of content
 * can be tagged with after processing.
 *
 * @author Braydon
 */
public enum ContentTag {
    /**
     * Tag used by the vulgarity text processor.
     */
    VULGARITY,

    /**
     * Tag for advertisements.
     * NOTE(review): no processor applying this tag
     * is visible alongside this enum -- confirm usage.
     */
    ADVERTISEMENT
}
|
@ -4,6 +4,7 @@ import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
import me.braydon.profanity.common.Language;
|
||||
import org.springframework.data.annotation.Id;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -16,6 +17,8 @@ import java.util.Map;
|
||||
*/
|
||||
@AllArgsConstructor @Getter
|
||||
public final class ProfanityList {
|
||||
@Id @NonNull private final String id;
|
||||
|
||||
/**
|
||||
* The links that are whitelisted from the filter.
|
||||
*/
|
||||
|
@ -15,6 +15,12 @@ public final class ContentProcessInput {
|
||||
*/
|
||||
private String content;
|
||||
|
||||
/**
|
||||
* The char to use for matched
|
||||
* replacement operations.
|
||||
*/
|
||||
private char replaceChar = '*';
|
||||
|
||||
/**
|
||||
* Check if this input is malformed.
|
||||
*
|
||||
|
@ -3,6 +3,7 @@ package me.braydon.profanity.model.response;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
import me.braydon.profanity.common.ContentTag;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -24,7 +25,7 @@ public final class ContentProcessResponse {
|
||||
/**
|
||||
* The tags obtained from the content.
|
||||
*/
|
||||
@NonNull private final List<String> tags;
|
||||
@NonNull private final List<ContentTag> tags;
|
||||
|
||||
/**
|
||||
* The score of the content.
|
||||
|
@ -1,4 +1,34 @@
|
||||
package me.braydon.profanity.processor;/**
|
||||
package me.braydon.profanity.processor;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
import me.braydon.profanity.common.ContentTag;
|
||||
import me.braydon.profanity.model.ProfanityList;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author Braydon
|
||||
*/public final class TextProcessor {
|
||||
*/
|
||||
@AllArgsConstructor @Getter
|
||||
public abstract class TextProcessor {
|
||||
/**
|
||||
* The tag that should be applied to content
|
||||
* if they are processed by this processor.
|
||||
*/
|
||||
@NonNull private final ContentTag tag;
|
||||
|
||||
/**
|
||||
* Processor the given content.
|
||||
*
|
||||
* @param profanityList the profanity list to use
|
||||
* @param content the content to process
|
||||
* @param replacement the replacement content to modify
|
||||
* @param replaceChar the replace char to use
|
||||
* @param matched the matched content to add to
|
||||
* @return the replaced content
|
||||
*/
|
||||
@NonNull public abstract StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content,
|
||||
@NonNull StringBuilder replacement, int replaceChar, @NonNull List<String> matched);
|
||||
}
|
@ -1,4 +1,129 @@
|
||||
package me.braydon.profanity.processor.impl;/**
|
||||
package me.braydon.profanity.processor.impl;
|
||||
|
||||
import lombok.NonNull;
|
||||
import me.braydon.profanity.common.ContentTag;
|
||||
import me.braydon.profanity.common.Language;
|
||||
import me.braydon.profanity.model.ProfanityList;
|
||||
import me.braydon.profanity.processor.TextProcessor;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* A text processor to filter vulgar content.
|
||||
*
|
||||
* @author Braydon
|
||||
*/public final class VulgarityProcessor {
|
||||
*/
|
||||
public final class VulgarityProcessor extends TextProcessor {
|
||||
/**
|
||||
* Patterns for profane words.
|
||||
*/
|
||||
private static final Map<String, Pattern> wordPatterns = Collections.synchronizedMap(new HashMap<>());
|
||||
|
||||
/**
|
||||
* Substitutions for characters in profane words.
|
||||
*/
|
||||
private static final Map<Character, Character> charSubstitutions = Collections.synchronizedMap(new HashMap<>());
|
||||
static { // Populate char substitutions
|
||||
charSubstitutions.put('3', 'e');
|
||||
charSubstitutions.put('1', 'i');
|
||||
charSubstitutions.put('!', 'i');
|
||||
charSubstitutions.put('@', 'a');
|
||||
charSubstitutions.put('7', 't');
|
||||
charSubstitutions.put('0', 'o');
|
||||
charSubstitutions.put('5', 's');
|
||||
charSubstitutions.put('8', 'b');
|
||||
charSubstitutions.put('$', 's');
|
||||
}
|
||||
|
||||
public VulgarityProcessor() {
|
||||
super(ContentTag.VULGARITY);
|
||||
}
|
||||
|
||||
/**
|
||||
* Processor the given content.
|
||||
*
|
||||
* @param profanityList the profanity list to use
|
||||
* @param content the content to process
|
||||
* @param replacement the replacement content to modify
|
||||
* @param replaceChar the replace char to use
|
||||
* @param matched the matched content to add to
|
||||
* @return the replaced content
|
||||
*/
|
||||
@Override @NonNull
|
||||
public StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content,
|
||||
@NonNull StringBuilder replacement, int replaceChar, @NonNull List<String> matched) {
|
||||
// Populate word patterns if empty
|
||||
if (wordPatterns.isEmpty()) {
|
||||
populatePatterns(profanityList);
|
||||
}
|
||||
content = content.replaceAll("\\p{Punct}", ""); // Replace punctuation
|
||||
|
||||
// Process single words in the content
|
||||
int offset = 0;
|
||||
for (Map.Entry<String, Pattern> entry : wordPatterns.entrySet()) {
|
||||
String word = entry.getKey();
|
||||
Pattern pattern = entry.getValue();
|
||||
Matcher matcher = pattern.matcher(content);
|
||||
|
||||
while (matcher.find()) {
|
||||
matched.add(word);
|
||||
int start = offset + matcher.start();
|
||||
int end = offset + matcher.end();
|
||||
replacement.replace(start, end, Character.toString(replaceChar).repeat(word.length()));
|
||||
offset += word.length() - (end - start);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Process phrases in the content
|
||||
|
||||
return replacement;
|
||||
}
|
||||
|
||||
/**
|
||||
* Populate the word patterns
|
||||
* for the given profanity list.
|
||||
*
|
||||
* @param profanityList the profanity list to use
|
||||
*/
|
||||
private void populatePatterns(@NonNull ProfanityList profanityList) {
|
||||
for (Map.Entry<Language, List<String>> entry : profanityList.getProfaneWords().entrySet()) {
|
||||
for (String word : entry.getValue()) {
|
||||
wordPatterns.put(word, Pattern.compile(buildCombinedRegex(word), Pattern.CASE_INSENSITIVE));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a regex pattern for the given word.
|
||||
* <p>
|
||||
* This pattern will match the exact, and
|
||||
* obfuscated versions of the word.
|
||||
* </p>
|
||||
*
|
||||
* @param word the word to build for
|
||||
* @return the built regex pattern
|
||||
*/
|
||||
@NonNull
|
||||
private String buildCombinedRegex(@NonNull String word) {
|
||||
StringBuilder exactWordRegex = new StringBuilder();
|
||||
StringBuilder obfuscatedWordRegex = new StringBuilder();
|
||||
|
||||
for (char character : word.toCharArray()) {
|
||||
char lowerChar = Character.toLowerCase(character);
|
||||
exactWordRegex.append(lowerChar);
|
||||
if (charSubstitutions.containsKey(lowerChar)) {
|
||||
obfuscatedWordRegex.append('[').append(lowerChar).append(charSubstitutions.get(lowerChar)).append(']');
|
||||
} else {
|
||||
obfuscatedWordRegex.append(lowerChar);
|
||||
}
|
||||
}
|
||||
|
||||
// Build the pattern
|
||||
return exactWordRegex + ((exactWordRegex.compareTo(obfuscatedWordRegex) == 0) ? "" : "|" + obfuscatedWordRegex);
|
||||
}
|
||||
}
|
@ -1,13 +1,27 @@
|
||||
package me.braydon.profanity.service;
|
||||
|
||||
import com.google.gson.JsonArray;
|
||||
import com.google.gson.JsonElement;
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.NonNull;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import me.braydon.profanity.TextPurifyAPI;
|
||||
import me.braydon.profanity.common.ContentTag;
|
||||
import me.braydon.profanity.common.Language;
|
||||
import me.braydon.profanity.model.ProfanityList;
|
||||
import me.braydon.profanity.model.input.ContentProcessInput;
|
||||
import me.braydon.profanity.model.response.ContentProcessResponse;
|
||||
import me.braydon.profanity.processor.TextProcessor;
|
||||
import me.braydon.profanity.processor.impl.VulgarityProcessor;
|
||||
import me.braydon.profanity.repository.ProfanityListRepository;
|
||||
import org.apache.commons.text.StringEscapeUtils;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* This service is responsible
|
||||
@ -15,17 +29,89 @@ import java.util.ArrayList;
|
||||
*
|
||||
* @author Braydon
|
||||
*/
|
||||
@Service
|
||||
@Service @Log4j2(topic = "Filtration Service")
|
||||
public final class FiltrationService {
|
||||
/**
|
||||
* The profanity list repository to use.
|
||||
*/
|
||||
@NonNull private final ProfanityListRepository profanityListRepository;
|
||||
|
||||
/**
|
||||
* The profanity list to use.
|
||||
*/
|
||||
private ProfanityList profanityList;
|
||||
|
||||
/**
|
||||
* The registered text processors to use.
|
||||
*/
|
||||
@NonNull private final List<TextProcessor> textProcessors = Collections.synchronizedList(new ArrayList<>());
|
||||
|
||||
@Autowired
|
||||
public FiltrationService(@NonNull ProfanityListRepository profanityListRepository) {
|
||||
this.profanityListRepository = profanityListRepository;
|
||||
|
||||
// Register text processors
|
||||
textProcessors.add(new VulgarityProcessor());
|
||||
}
|
||||
|
||||
/**
|
||||
* Populate the database with
|
||||
* default lists if empty.
|
||||
*/
|
||||
@PostConstruct @SneakyThrows
|
||||
public void populateDefaults() {
|
||||
long before = System.currentTimeMillis();
|
||||
|
||||
// List is already present
|
||||
if ((profanityList = profanityListRepository.getProfanityList()) != null) {
|
||||
log.info("Loaded lists in {}ms", System.currentTimeMillis() - before);
|
||||
return;
|
||||
}
|
||||
// Download the pre-made lists
|
||||
// for each language and save it.
|
||||
log.info("Downloading pre-made lists...");
|
||||
before = System.currentTimeMillis();
|
||||
|
||||
Map<Language, List<String>> profaneWords = new HashMap<>();
|
||||
Map<Language, List<String>> profanePhrases = new HashMap<>();
|
||||
for (Language lang : Language.values()) {
|
||||
String contentUrl = "https://raw.githubusercontent.com/Rainnny7/TextPurify/master/lists/" + lang.name().toLowerCase() + ".json";
|
||||
JsonArray content = TextPurifyAPI.GSON.fromJson(new Scanner(new URL(contentUrl).openStream(),
|
||||
StandardCharsets.UTF_8
|
||||
).useDelimiter("\\A").next(), JsonArray.class);
|
||||
for (JsonElement item : content) {
|
||||
String element = item.getAsString();
|
||||
(element.contains(" ") ? profanePhrases : profaneWords).computeIfAbsent(lang, $ -> new ArrayList<>()).add(element);
|
||||
}
|
||||
}
|
||||
profanityList = profanityListRepository.save(new ProfanityList("primary", new ArrayList<>(), profaneWords, profanePhrases));
|
||||
log.info("Downloaded lists in {}ms", System.currentTimeMillis() - before);
|
||||
}
|
||||
|
||||
@NonNull
|
||||
public ContentProcessResponse process(@NonNull ContentProcessInput input) {
|
||||
return new ContentProcessResponse(input.getContent(), new ArrayList<>(), new ArrayList<>(), 0D);
|
||||
List<String> matched = new ArrayList<>(); // The content that was matched
|
||||
List<ContentTag> tags = new ArrayList<>(); // Tags obtained from the processed content
|
||||
StringBuilder replacement = new StringBuilder(input.getContent());
|
||||
|
||||
// Handle filtering if a profanity list is present
|
||||
if (profanityList != null) {
|
||||
String content = StringEscapeUtils.escapeJava(input.getContent()).toLowerCase().trim(); // The content to filter
|
||||
|
||||
// Invoke each text processor on the content
|
||||
for (TextProcessor textProcessor : textProcessors) {
|
||||
int before = matched.size();
|
||||
replacement = textProcessor.process(profanityList, content, replacement, input.getReplaceChar(), matched);
|
||||
if (matched.size() > before) {
|
||||
tags.add(textProcessor.getTag());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate the score based on
|
||||
// the matched profane content
|
||||
double score = 0D;
|
||||
|
||||
return new ContentProcessResponse(replacement.toString(), matched, tags, score);
|
||||
}
|
||||
}
|
@ -10,6 +10,13 @@ logging:
|
||||
|
||||
# Spring Configuration
|
||||
spring:
|
||||
data:
|
||||
# MongoDB Configuration
|
||||
mongodb:
|
||||
uri: "mongodb://textpurify:p4$$w0rd@localhost:27017"
|
||||
database: "textpurify"
|
||||
auto-index-creation: true # Automatically create collection indexes
|
||||
|
||||
# Don't serialize null values by default with Jackson
|
||||
jackson:
|
||||
default-property-inclusion: non_null
|
||||
|
Loading…
x
Reference in New Issue
Block a user