Basic functionality
This commit is contained in:
parent
b003ed5837
commit
6db88e8932
18
API/pom.xml
18
API/pom.xml
@ -49,6 +49,12 @@
|
|||||||
<artifactId>spring-boot-starter-web</artifactId>
|
<artifactId>spring-boot-starter-web</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<!-- MongoDB -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-starter-data-mongodb</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- Libraries -->
|
<!-- Libraries -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.projectlombok</groupId>
|
<groupId>org.projectlombok</groupId>
|
||||||
@ -56,5 +62,17 @@
|
|||||||
<version>1.18.32</version>
|
<version>1.18.32</version>
|
||||||
<scope>provided</scope>
|
<scope>provided</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.google.code.gson</groupId>
|
||||||
|
<artifactId>gson</artifactId>
|
||||||
|
<version>2.11.0</version>
|
||||||
|
<scope>compile</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-text</artifactId>
|
||||||
|
<version>1.12.0</version>
|
||||||
|
<scope>compile</scope>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</project>
|
</project>
|
@ -1,5 +1,6 @@
|
|||||||
package me.braydon.profanity;
|
package me.braydon.profanity;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.log4j.Log4j2;
|
import lombok.extern.log4j.Log4j2;
|
||||||
@ -17,6 +18,8 @@ import java.util.Objects;
|
|||||||
@SpringBootApplication
|
@SpringBootApplication
|
||||||
@Log4j2(topic = "TextPurify")
|
@Log4j2(topic = "TextPurify")
|
||||||
public class TextPurifyAPI {
|
public class TextPurifyAPI {
|
||||||
|
public static final Gson GSON = new Gson();
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static void main(@NonNull String[] args) {
|
public static void main(@NonNull String[] args) {
|
||||||
// Handle loading of our configuration file
|
// Handle loading of our configuration file
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
package me.braydon.profanity.common;/**
|
package me.braydon.profanity.common;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tags to obtain from content.
|
||||||
|
*
|
||||||
* @author Braydon
|
* @author Braydon
|
||||||
*/public enum ContentTag {
|
*/
|
||||||
|
public enum ContentTag {
|
||||||
|
VULGARITY, ADVERTISEMENT
|
||||||
}
|
}
|
@ -4,6 +4,7 @@ import lombok.AllArgsConstructor;
|
|||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
import me.braydon.profanity.common.Language;
|
import me.braydon.profanity.common.Language;
|
||||||
|
import org.springframework.data.annotation.Id;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -16,6 +17,8 @@ import java.util.Map;
|
|||||||
*/
|
*/
|
||||||
@AllArgsConstructor @Getter
|
@AllArgsConstructor @Getter
|
||||||
public final class ProfanityList {
|
public final class ProfanityList {
|
||||||
|
@Id @NonNull private final String id;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The links that are whitelisted from the filter.
|
* The links that are whitelisted from the filter.
|
||||||
*/
|
*/
|
||||||
|
@ -15,6 +15,12 @@ public final class ContentProcessInput {
|
|||||||
*/
|
*/
|
||||||
private String content;
|
private String content;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The char to use for matched
|
||||||
|
* replacement operations.
|
||||||
|
*/
|
||||||
|
private char replaceChar = '*';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if this input is malformed.
|
* Check if this input is malformed.
|
||||||
*
|
*
|
||||||
|
@ -3,6 +3,7 @@ package me.braydon.profanity.model.response;
|
|||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
|
import me.braydon.profanity.common.ContentTag;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -24,7 +25,7 @@ public final class ContentProcessResponse {
|
|||||||
/**
|
/**
|
||||||
* The tags obtained from the content.
|
* The tags obtained from the content.
|
||||||
*/
|
*/
|
||||||
@NonNull private final List<String> tags;
|
@NonNull private final List<ContentTag> tags;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The score of the content.
|
* The score of the content.
|
||||||
|
@ -1,4 +1,34 @@
|
|||||||
package me.braydon.profanity.processor;/**
|
package me.braydon.profanity.processor;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NonNull;
|
||||||
|
import me.braydon.profanity.common.ContentTag;
|
||||||
|
import me.braydon.profanity.model.ProfanityList;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
* @author Braydon
|
* @author Braydon
|
||||||
*/public final class TextProcessor {
|
*/
|
||||||
|
@AllArgsConstructor @Getter
|
||||||
|
public abstract class TextProcessor {
|
||||||
|
/**
|
||||||
|
* The tag that should be applied to content
|
||||||
|
* if they are processed by this processor.
|
||||||
|
*/
|
||||||
|
@NonNull private final ContentTag tag;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Process the given content.
|
||||||
|
*
|
||||||
|
* @param profanityList the profanity list to use
|
||||||
|
* @param content the content to process
|
||||||
|
* @param replacement the replacement content to modify
|
||||||
|
* @param replaceChar the replace char to use
|
||||||
|
* @param matched the matched content to add to
|
||||||
|
* @return the replaced content
|
||||||
|
*/
|
||||||
|
@NonNull public abstract StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content,
|
||||||
|
@NonNull StringBuilder replacement, int replaceChar, @NonNull List<String> matched);
|
||||||
}
|
}
|
@ -1,4 +1,129 @@
|
|||||||
package me.braydon.profanity.processor.impl;/**
|
package me.braydon.profanity.processor.impl;
|
||||||
|
|
||||||
|
import lombok.NonNull;
|
||||||
|
import me.braydon.profanity.common.ContentTag;
|
||||||
|
import me.braydon.profanity.common.Language;
|
||||||
|
import me.braydon.profanity.model.ProfanityList;
|
||||||
|
import me.braydon.profanity.processor.TextProcessor;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A text processor to filter vulgar content.
|
||||||
|
*
|
||||||
* @author Braydon
|
* @author Braydon
|
||||||
*/public final class VulgarityProcessor {
|
*/
|
||||||
|
public final class VulgarityProcessor extends TextProcessor {
|
||||||
|
/**
|
||||||
|
* Patterns for profane words.
|
||||||
|
*/
|
||||||
|
private static final Map<String, Pattern> wordPatterns = Collections.synchronizedMap(new HashMap<>());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Substitutions for characters in profane words.
|
||||||
|
*/
|
||||||
|
private static final Map<Character, Character> charSubstitutions = Collections.synchronizedMap(new HashMap<>());
|
||||||
|
static { // Populate char substitutions
|
||||||
|
charSubstitutions.put('3', 'e');
|
||||||
|
charSubstitutions.put('1', 'i');
|
||||||
|
charSubstitutions.put('!', 'i');
|
||||||
|
charSubstitutions.put('@', 'a');
|
||||||
|
charSubstitutions.put('7', 't');
|
||||||
|
charSubstitutions.put('0', 'o');
|
||||||
|
charSubstitutions.put('5', 's');
|
||||||
|
charSubstitutions.put('8', 'b');
|
||||||
|
charSubstitutions.put('$', 's');
|
||||||
|
}
|
||||||
|
|
||||||
|
public VulgarityProcessor() {
|
||||||
|
super(ContentTag.VULGARITY);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Processor the given content.
|
||||||
|
*
|
||||||
|
* @param profanityList the profanity list to use
|
||||||
|
* @param content the content to process
|
||||||
|
* @param replacement the replacement content to modify
|
||||||
|
* @param replaceChar the replace char to use
|
||||||
|
* @param matched the matched content to add to
|
||||||
|
* @return the replaced content
|
||||||
|
*/
|
||||||
|
@Override @NonNull
|
||||||
|
public StringBuilder process(@NonNull ProfanityList profanityList, @NonNull String content,
|
||||||
|
@NonNull StringBuilder replacement, int replaceChar, @NonNull List<String> matched) {
|
||||||
|
// Populate word patterns if empty
|
||||||
|
if (wordPatterns.isEmpty()) {
|
||||||
|
populatePatterns(profanityList);
|
||||||
|
}
|
||||||
|
content = content.replaceAll("\\p{Punct}", ""); // Replace punctuation
|
||||||
|
|
||||||
|
// Process single words in the content
|
||||||
|
int offset = 0;
|
||||||
|
for (Map.Entry<String, Pattern> entry : wordPatterns.entrySet()) {
|
||||||
|
String word = entry.getKey();
|
||||||
|
Pattern pattern = entry.getValue();
|
||||||
|
Matcher matcher = pattern.matcher(content);
|
||||||
|
|
||||||
|
while (matcher.find()) {
|
||||||
|
matched.add(word);
|
||||||
|
int start = offset + matcher.start();
|
||||||
|
int end = offset + matcher.end();
|
||||||
|
replacement.replace(start, end, Character.toString(replaceChar).repeat(word.length()));
|
||||||
|
offset += word.length() - (end - start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Process phrases in the content
|
||||||
|
|
||||||
|
return replacement;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Populate the word patterns
|
||||||
|
* for the given profanity list.
|
||||||
|
*
|
||||||
|
* @param profanityList the profanity list to use
|
||||||
|
*/
|
||||||
|
private void populatePatterns(@NonNull ProfanityList profanityList) {
|
||||||
|
for (Map.Entry<Language, List<String>> entry : profanityList.getProfaneWords().entrySet()) {
|
||||||
|
for (String word : entry.getValue()) {
|
||||||
|
wordPatterns.put(word, Pattern.compile(buildCombinedRegex(word), Pattern.CASE_INSENSITIVE));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build a regex pattern for the given word.
|
||||||
|
* <p>
|
||||||
|
* This pattern will match the exact, and
|
||||||
|
* obfuscated versions of the word.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param word the word to build for
|
||||||
|
* @return the built regex pattern
|
||||||
|
*/
|
||||||
|
@NonNull
|
||||||
|
private String buildCombinedRegex(@NonNull String word) {
|
||||||
|
StringBuilder exactWordRegex = new StringBuilder();
|
||||||
|
StringBuilder obfuscatedWordRegex = new StringBuilder();
|
||||||
|
|
||||||
|
for (char character : word.toCharArray()) {
|
||||||
|
char lowerChar = Character.toLowerCase(character);
|
||||||
|
exactWordRegex.append(lowerChar);
|
||||||
|
if (charSubstitutions.containsKey(lowerChar)) {
|
||||||
|
obfuscatedWordRegex.append('[').append(lowerChar).append(charSubstitutions.get(lowerChar)).append(']');
|
||||||
|
} else {
|
||||||
|
obfuscatedWordRegex.append(lowerChar);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the pattern
|
||||||
|
return exactWordRegex + ((exactWordRegex.compareTo(obfuscatedWordRegex) == 0) ? "" : "|" + obfuscatedWordRegex);
|
||||||
|
}
|
||||||
}
|
}
|
@ -1,13 +1,27 @@
|
|||||||
package me.braydon.profanity.service;
|
package me.braydon.profanity.service;
|
||||||
|
|
||||||
|
import com.google.gson.JsonArray;
|
||||||
|
import com.google.gson.JsonElement;
|
||||||
|
import jakarta.annotation.PostConstruct;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.extern.log4j.Log4j2;
|
||||||
|
import me.braydon.profanity.TextPurifyAPI;
|
||||||
|
import me.braydon.profanity.common.ContentTag;
|
||||||
|
import me.braydon.profanity.common.Language;
|
||||||
|
import me.braydon.profanity.model.ProfanityList;
|
||||||
import me.braydon.profanity.model.input.ContentProcessInput;
|
import me.braydon.profanity.model.input.ContentProcessInput;
|
||||||
import me.braydon.profanity.model.response.ContentProcessResponse;
|
import me.braydon.profanity.model.response.ContentProcessResponse;
|
||||||
|
import me.braydon.profanity.processor.TextProcessor;
|
||||||
|
import me.braydon.profanity.processor.impl.VulgarityProcessor;
|
||||||
import me.braydon.profanity.repository.ProfanityListRepository;
|
import me.braydon.profanity.repository.ProfanityListRepository;
|
||||||
|
import org.apache.commons.text.StringEscapeUtils;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.net.URL;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This service is responsible
|
* This service is responsible
|
||||||
@ -15,17 +29,89 @@ import java.util.ArrayList;
|
|||||||
*
|
*
|
||||||
* @author Braydon
|
* @author Braydon
|
||||||
*/
|
*/
|
||||||
@Service
|
@Service @Log4j2(topic = "Filtration Service")
|
||||||
public final class FiltrationService {
|
public final class FiltrationService {
|
||||||
|
/**
|
||||||
|
* The profanity list repository to use.
|
||||||
|
*/
|
||||||
@NonNull private final ProfanityListRepository profanityListRepository;
|
@NonNull private final ProfanityListRepository profanityListRepository;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The profanity list to use.
|
||||||
|
*/
|
||||||
|
private ProfanityList profanityList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The registered text processors to use.
|
||||||
|
*/
|
||||||
|
@NonNull private final List<TextProcessor> textProcessors = Collections.synchronizedList(new ArrayList<>());
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
public FiltrationService(@NonNull ProfanityListRepository profanityListRepository) {
|
public FiltrationService(@NonNull ProfanityListRepository profanityListRepository) {
|
||||||
this.profanityListRepository = profanityListRepository;
|
this.profanityListRepository = profanityListRepository;
|
||||||
|
|
||||||
|
// Register text processors
|
||||||
|
textProcessors.add(new VulgarityProcessor());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Populate the database with
|
||||||
|
* default lists if empty.
|
||||||
|
*/
|
||||||
|
@PostConstruct @SneakyThrows
|
||||||
|
public void populateDefaults() {
|
||||||
|
long before = System.currentTimeMillis();
|
||||||
|
|
||||||
|
// List is already present
|
||||||
|
if ((profanityList = profanityListRepository.getProfanityList()) != null) {
|
||||||
|
log.info("Loaded lists in {}ms", System.currentTimeMillis() - before);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Download the pre-made lists
|
||||||
|
// for each language and save it.
|
||||||
|
log.info("Downloading pre-made lists...");
|
||||||
|
before = System.currentTimeMillis();
|
||||||
|
|
||||||
|
Map<Language, List<String>> profaneWords = new HashMap<>();
|
||||||
|
Map<Language, List<String>> profanePhrases = new HashMap<>();
|
||||||
|
for (Language lang : Language.values()) {
|
||||||
|
String contentUrl = "https://raw.githubusercontent.com/Rainnny7/TextPurify/master/lists/" + lang.name().toLowerCase() + ".json";
|
||||||
|
JsonArray content = TextPurifyAPI.GSON.fromJson(new Scanner(new URL(contentUrl).openStream(),
|
||||||
|
StandardCharsets.UTF_8
|
||||||
|
).useDelimiter("\\A").next(), JsonArray.class);
|
||||||
|
for (JsonElement item : content) {
|
||||||
|
String element = item.getAsString();
|
||||||
|
(element.contains(" ") ? profanePhrases : profaneWords).computeIfAbsent(lang, $ -> new ArrayList<>()).add(element);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
profanityList = profanityListRepository.save(new ProfanityList("primary", new ArrayList<>(), profaneWords, profanePhrases));
|
||||||
|
log.info("Downloaded lists in {}ms", System.currentTimeMillis() - before);
|
||||||
}
|
}
|
||||||
|
|
||||||
@NonNull
|
@NonNull
|
||||||
public ContentProcessResponse process(@NonNull ContentProcessInput input) {
|
public ContentProcessResponse process(@NonNull ContentProcessInput input) {
|
||||||
return new ContentProcessResponse(input.getContent(), new ArrayList<>(), new ArrayList<>(), 0D);
|
List<String> matched = new ArrayList<>(); // The content that was matched
|
||||||
|
List<ContentTag> tags = new ArrayList<>(); // Tags obtained from the processed content
|
||||||
|
StringBuilder replacement = new StringBuilder(input.getContent());
|
||||||
|
|
||||||
|
// Handle filtering if a profanity list is present
|
||||||
|
if (profanityList != null) {
|
||||||
|
String content = StringEscapeUtils.escapeJava(input.getContent()).toLowerCase().trim(); // The content to filter
|
||||||
|
|
||||||
|
// Invoke each text processor on the content
|
||||||
|
for (TextProcessor textProcessor : textProcessors) {
|
||||||
|
int before = matched.size();
|
||||||
|
replacement = textProcessor.process(profanityList, content, replacement, input.getReplaceChar(), matched);
|
||||||
|
if (matched.size() > before) {
|
||||||
|
tags.add(textProcessor.getTag());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate the score based on
|
||||||
|
// the matched profane content
|
||||||
|
double score = 0D;
|
||||||
|
|
||||||
|
return new ContentProcessResponse(replacement.toString(), matched, tags, score);
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -10,6 +10,13 @@ logging:
|
|||||||
|
|
||||||
# Spring Configuration
|
# Spring Configuration
|
||||||
spring:
|
spring:
|
||||||
|
data:
|
||||||
|
# MongoDB Configuration
|
||||||
|
mongodb:
|
||||||
|
uri: "mongodb://textpurify:p4$$w0rd@localhost:27017"
|
||||||
|
database: "textpurify"
|
||||||
|
auto-index-creation: true # Automatically create collection indexes
|
||||||
|
|
||||||
# Don't serialize null values by default with Jackson
|
# Don't serialize null values by default with Jackson
|
||||||
jackson:
|
jackson:
|
||||||
default-property-inclusion: non_null
|
default-property-inclusion: non_null
|
||||||
|
Loading…
x
Reference in New Issue
Block a user