Support elongated words

This commit is contained in:
Braydon 2024-06-06 22:09:04 -04:00
parent 237b42afa5
commit cc80a10088
2 changed files with 15 additions and 7 deletions

@ -28,6 +28,7 @@ public final class VulgarityProcessor extends TextProcessor {
* Substitutions for characters in profane words. * Substitutions for characters in profane words.
*/ */
private static final Map<Character, List<Character>> charSubstitutions = Collections.synchronizedMap(new HashMap<>()); private static final Map<Character, List<Character>> charSubstitutions = Collections.synchronizedMap(new HashMap<>());
static { // Populate char substitutions static { // Populate char substitutions
charSubstitutions.put('e', Collections.singletonList('3')); charSubstitutions.put('e', Collections.singletonList('3'));
charSubstitutions.put('i', List.of('1', '!')); charSubstitutions.put('i', List.of('1', '!'));
@ -75,8 +76,9 @@ public final class VulgarityProcessor extends TextProcessor {
matched.add(word); matched.add(word);
int start = offset + matcher.start(); int start = offset + matcher.start();
int end = offset + matcher.end(); int end = offset + matcher.end();
replacement.replace(start, end, Character.toString(replaceChar).repeat(word.length())); String matchedWord = matcher.group();
offset += word.length() - (end - start); replacement.replace(start, end, Character.toString(replaceChar).repeat(matchedWord.length()));
offset += matchedWord.length() - (end - start);
} }
} }
@ -122,7 +124,7 @@ public final class VulgarityProcessor extends TextProcessor {
for (Character substitution : charSubstitutions.get(lowerChar)) { for (Character substitution : charSubstitutions.get(lowerChar)) {
chars.append(substitution); chars.append(substitution);
} }
obfuscatedWordRegex.append('[').append(chars).append(']'); obfuscatedWordRegex.append('[').append(chars).append("]+");
} else { } else {
obfuscatedWordRegex.append(lowerChar); obfuscatedWordRegex.append(lowerChar);
} }

@ -88,16 +88,22 @@ public final class FiltrationService {
log.info("Downloaded lists in {}ms", System.currentTimeMillis() - before); log.info("Downloaded lists in {}ms", System.currentTimeMillis() - before);
} }
/**
* Filter the content in the given input.
*
* @param input the input to filter
* @return the response from filtering the content
*/
@NonNull @NonNull
public ContentProcessResponse process(@NonNull ContentProcessInput input) { public ContentProcessResponse process(@NonNull ContentProcessInput input) {
String content = StringEscapeUtils.escapeJava(input.getContent()).toLowerCase().trim(); // The content to filter
List<String> matched = new ArrayList<>(); // The content that was matched List<String> matched = new ArrayList<>(); // The content that was matched
List<ContentTag> tags = new ArrayList<>(); // Tags obtained from the processed content List<ContentTag> tags = new ArrayList<>(); // Tags obtained from the processed content
StringBuilder replacement = new StringBuilder(input.getContent()); StringBuilder replacement = new StringBuilder(input.getContent());
// Handle filtering if a profanity list is present // Handle filtering if a profanity list is present
if (profanityList != null) { if (profanityList != null) {
String content = StringEscapeUtils.escapeJava(input.getContent()).toLowerCase().trim(); // The content to filter
// Invoke each text processor on the content // Invoke each text processor on the content
for (TextProcessor textProcessor : textProcessors) { for (TextProcessor textProcessor : textProcessors) {
int before = matched.size(); int before = matched.size();
@ -109,8 +115,8 @@ public final class FiltrationService {
} }
// Calculate the score based on // Calculate the score based on
// the matched profane content // the matched profane content, that cannot be bypassed by changing the content length
double score = 0D; double score = Math.min(matched.stream().mapToDouble(String::length).sum() / content.length(), 1D);
return new ContentProcessResponse(replacement.toString(), matched, tags, score); return new ContentProcessResponse(replacement.toString(), matched, tags, score);
} }