Commit 9d66d59c by fukai

Initial SenseWord Lib

parents
File added
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"/>
<classpathentry kind="output" path="target/classes"/>
</classpath>
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>senstiveword</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>
</projectDescription>
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/test/java=UTF-8
encoding/<project>=UTF-8
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.8
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
File added
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.brilliace.word</groupId>
<artifactId>senstiveword</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>senstiveword</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>sax</groupId>
<artifactId>sax</artifactId>
<version>2.0.1</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<finalName>senstiveword</finalName>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<!-- Keep in sync with the Eclipse project settings (JavaSE-1.8). The sources use
     @Override on interface methods, Long.compare and Map.getOrDefault, which do not
     build at the 1.5 language/API level. -->
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
File added
File added
package SymSpell;// MIT License
//
// Copyright (c) 2018 Hampus Londögård
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
import java.util.Arrays;
import SymSpell.SuggestionStage.Node;
// A growable list of elements that's optimized to support adds, but not deletes,
// of large numbers of elements, storing data in a way that's friendly to the garbage
// collector (not backed by a monolithic array object), and can grow without needing
// to copy the entire backing array contents from the old backing array to the new.
/**
 * A growable, append-oriented list of {@link Node} elements stored as fixed-size chunks.
 *
 * <p>Growing the list appends one more chunk and copies only the array of chunk
 * references, never the elements themselves. Elements can be added and overwritten
 * but not removed individually.
 *
 * <p>The type parameter {@code T} is not used by the implementation; the storage is
 * always {@code Node[][]}.
 */
public class ChunkArray<T>
{
    /** Number of bits to shift an index right to obtain its chunk (log2 of the chunk size). */
    private static final int DIV_SHIFT = 12;
    /** Elements per chunk; derived from DIV_SHIFT so it is always the matching power of two. */
    private static final int CHUNK_SIZE = 1 << DIV_SHIFT;
    /** Mask that extracts the position inside a chunk from an index. */
    private static final int COL_MASK = CHUNK_SIZE - 1;

    /** Backing storage: {@code values[chunk][positionInChunk]}. */
    public Node[][] values; // Note: Node (SymSpell.SuggestionStage.Node) is declared in SuggestionStage.
    /** Number of elements currently stored. */
    public int count;

    /**
     * Creates a list with enough pre-allocated chunks to hold {@code initialCapacity} elements.
     *
     * @param initialCapacity expected number of elements; values of zero or less allocate no chunk up front
     */
    ChunkArray(int initialCapacity)
    {
        // Round up to whole chunks; non-positive capacities simply start empty.
        int chunks = initialCapacity <= 0 ? 0 : (initialCapacity + CHUNK_SIZE - 1) / CHUNK_SIZE;
        values = new Node[chunks][];
        for (int i = 0; i < values.length; i++) {
            values[i] = new Node[CHUNK_SIZE];
        }
    }

    /**
     * Appends an element, adding a new chunk first when all existing chunks are full.
     *
     * @param value the element to append
     * @return the index at which the element was stored
     */
    public int add(Node value)
    {
        if (count == capacity()) {
            // Only the chunk references are copied; existing chunks are reused as they are.
            Node[][] newValues = Arrays.copyOf(values, values.length + 1);
            newValues[values.length] = new Node[CHUNK_SIZE];
            values = newValues;
        }
        int index = count;
        values[row(index)][col(index)] = value;
        count++;
        return index;
    }

    /** Resets the element count to zero; allocated chunks and their contents are kept. */
    public void clear()
    {
        count = 0;
    }

    /**
     * Returns the element stored at {@code index}.
     *
     * @param index position of the element
     * @return the stored element
     */
    public Node getValues(int index) {
        return values[row(index)][col(index)];
    }

    /**
     * Overwrites the element stored at {@code index}.
     *
     * @param index position to overwrite
     * @param value the new element
     */
    public void setValues(int index, Node value){
        values[row(index)][col(index)] = value;
    }

    /**
     * Writes {@code value} at {@code index} into the supplied chunked array instead of
     * this instance's own storage.
     *
     * @param index position to overwrite
     * @param value the new element
     * @param list  chunked array to write into, laid out with the same chunk size
     */
    public void setValues(int index, Node value, Node[][] list){
        list[row(index)][col(index)] = value;
    }

    /** Chunk number for an index; same as {@code index / CHUNK_SIZE}. */
    private int row(int index) { return index >> DIV_SHIFT; }

    /** Position inside the chunk for an index; same as {@code index % CHUNK_SIZE}. */
    private int col(int index) { return index & COL_MASK; }

    /** Total number of elements the currently allocated chunks can hold. */
    private int capacity() { return values.length * CHUNK_SIZE; }
}
\ No newline at end of file
package SymSpell;// MIT License
//
// Copyright (c) 2018 Hampus Londögård
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
/**
 * Computes the edit distance between a fixed base string and other strings.
 *
 * <p>An instance keeps two scratch arrays that are reused and overwritten by every
 * comparison, so one instance must not be used by several threads at the same time.
 */
public class EditDistance {
    /** Edit distance algorithms supported by {@link EditDistance}. */
    public enum DistanceAlgorithm{
        /** Counts insertions, deletions, substitutions and transpositions of adjacent characters. */
        Damerau
    }

    private String baseString;
    private DistanceAlgorithm algorithm;
    private int[] v0;
    private int[] v2;

    /**
     * Creates a new EditDistance object.
     *
     * @param baseString the base string to which other strings will be compared;
     *                   {@code null} is treated like the empty string
     * @param algorithm  the desired edit distance algorithm
     */
    EditDistance(String baseString, DistanceAlgorithm algorithm)
    {
        this.algorithm = algorithm;
        if (baseString == null || baseString.isEmpty()) {
            // An absent base string is stored as null; comparisons then just return the other length.
            this.baseString = null;
            return;
        }
        this.baseString = baseString;
        if (algorithm == DistanceAlgorithm.Damerau) {
            v0 = new int[baseString.length()];
            v2 = new int[baseString.length()]; // stores one level further back (offset by +1 position)
        }
    }

    /**
     * Compares a string to the base string using the algorithm selected at construction.
     *
     * @param string2     the string to compare
     * @param maxDistance the maximum distance allowed
     * @return the edit distance, or -1 if maxDistance is exceeded
     * @throws IllegalArgumentException if no known algorithm was selected
     */
    public int compare(String string2, int maxDistance) {
        if (algorithm == DistanceAlgorithm.Damerau) {
            return DamerauLevenshteinDistance(string2, maxDistance);
        }
        throw new IllegalArgumentException("unknown DistanceAlgorithm");
    }

    /**
     * Computes the Damerau edit distance between the base string and {@code string2}.
     *
     * <p>If either string is null or empty, the length of the other one is returned
     * without looking at {@code maxDistance}. The same holds when, after removing the
     * common prefix and suffix, nothing is left of the shorter string: the number of
     * remaining characters of the longer string is returned directly.
     *
     * @param string2     string being compared against the base string
     * @param maxDistance the maximum edit distance of interest; a negative value, or one
     *                    larger than the compared length, means no limit
     * @return edit distance, &gt;= 0, representing the number of edits required to transform
     *         one string into the other, or -1 if the distance is greater than maxDistance
     */
    public int DamerauLevenshteinDistance(String string2, int maxDistance) {
        if (baseString == null) return string2 == null ? 0 : string2.length();
        if (string2 == null || string2.isEmpty()) return baseString.length();

        // If the strings have different lengths, ensure the shorter one is in string1. This can be a
        // little faster because more time is spent spinning just the inner loop during the main processing.
        String string1;
        if (baseString.length() > string2.length()) {
            string1 = string2;
            string2 = baseString;
        } else {
            string1 = baseString;
        }
        int sLen = string1.length(); // this is also the minimum length of the two strings
        int tLen = string2.length();

        // suffix common to both strings can be ignored
        while ((sLen > 0) && (string1.charAt(sLen - 1) == string2.charAt(tLen - 1))) { sLen--; tLen--; }

        int start = 0;
        // if there's a shared prefix, or all of string1 matches string2's suffix
        if ((string1.charAt(0) == string2.charAt(0)) || (sLen == 0)) {
            // prefix common to both strings can be ignored
            while ((start < sLen) && (string1.charAt(start) == string2.charAt(start))) start++;
            sLen -= start; // length of the part excluding common prefix and suffix
            tLen -= start;

            // if all of the shorter string matches prefix and/or suffix of the longer string, then
            // the edit distance is just the delete of the additional characters in the longer string
            if (sLen == 0) return tLen;

            string2 = string2.substring(start, start + tLen); // faster than string2[start+j] in inner loop below
        }
        int lenDiff = tLen - sLen;
        if ((maxDistance < 0) || (maxDistance > tLen)) {
            maxDistance = tLen;
        } else if (lenDiff > maxDistance) return -1;

        if (v0 == null || tLen > v0.length)
        {
            // Scratch arrays are missing or too small for this comparison: allocate fresh (zeroed) ones.
            v0 = new int[tLen];
            v2 = new int[tLen];
        } else {
            for (int i = 0; i < tLen; i++) v2[i] = 0; // clear only the part that will be used
        }
        int j;
        for (j = 0; j < maxDistance; j++) v0[j] = j + 1;
        for (; j < tLen; j++) v0[j] = maxDistance + 1;

        int jStartOffset = maxDistance - (tLen - sLen);
        boolean haveMax = maxDistance < tLen;
        int jStart = 0;
        int jEnd = maxDistance;
        char sChar = string1.charAt(0);
        int current = 0;
        for (int i = 0; i < sLen; i++) {
            char prevsChar = sChar;
            sChar = string1.charAt(start + i);
            char tChar = string2.charAt(0);
            int left = i;
            current = left + 1;
            int nextTransCost = 0;
            // no need to look beyond window of lower right diagonal - maxDistance cells (lower right diag is i - lenDiff)
            // and the upper left diagonal + maxDistance cells (upper left is i)
            jStart += (i > jStartOffset) ? 1 : 0;
            jEnd += (jEnd < tLen) ? 1 : 0;
            for (j = jStart; j < jEnd; j++) {
                int above = current;
                int thisTransCost = nextTransCost;
                nextTransCost = v2[j];
                v2[j] = current = left; // cost of diagonal (substitution)
                left = v0[j];           // left now equals current cost (which will be diagonal at next iteration)
                char prevtChar = tChar;
                tChar = string2.charAt(j);
                if (sChar != tChar) {
                    if (left < current) current = left;   // insertion
                    if (above < current) current = above; // deletion
                    current++;
                    if ((i != 0) && (j != 0)
                            && (sChar == prevtChar)
                            && (prevsChar == tChar)) {
                        thisTransCost++;
                        if (thisTransCost < current) current = thisTransCost; // transposition
                    }
                }
                v0[j] = current;
            }
            // Early exit: the cell on the result diagonal already exceeds the limit.
            if (haveMax && (v0[i + lenDiff] > maxDistance)) return -1;
        }
        return (current <= maxDistance) ? current : -1;
    }
}
package SymSpell;// MIT License
//
// Copyright (c) 2018 Hampus Londögård
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
import java.util.Comparator;
public class SuggestItem implements Comparator<SuggestItem>, Comparable<SuggestItem>
{
/// <summary>The suggested correctly spelled word.</summary>
public String term;
/// <summary>Edit distance between searched for word and suggestion.</summary>
public int distance;
/// <summary>Frequency of suggestion in the dictionary (a measure of how common the word is).</summary>
public long count;
/// <summary>Create a new instance of SymSpell.SuggestItem.</summary>
/// <param name="term">The suggested word.</param>
/// <param name="distance">Edit distance from search word.</param>
/// <param name="count">Frequency of suggestion in dictionary.</param>
public SuggestItem(String term, int distance, long count) {
this.term = term;
this.distance = distance;
this.count = count;
}
@Override
public int compare(SuggestItem suggestItem, SuggestItem t1) {
return suggestItem.compareTo(t1);
}
@Override
public boolean equals(Object obj) {
return obj instanceof SuggestItem && term.equals(((SuggestItem) obj).term);
}
@Override
public int hashCode()
{
return term.hashCode();
}
@Override
public String toString()
{
return "{" + term + ", " + distance + ", " + count + "}";
}
@Override
public int compareTo(SuggestItem other) {
// order by distance ascending, then by frequency count descending
if (this.distance == other.distance) return Long.compare(other.count, this.count);
return Integer.compare(this.distance, other.distance);
}
public SuggestItem clone(){
return new SuggestItem(this.term, this.distance, this.count);
}
}
package SymSpell;
// MIT License
//
// Copyright (c) 2018 Hampus Londögård
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/// <summary>An intentionally opaque class used to temporarily stage
/// dictionary data during the adding of many words. By staging the
/// data during the building of the dictionary data, significant savings
/// of time can be achieved, as well as a reduction in final memory usage.</summary>
public class SuggestionStage {
    /** One staged suggestion, linked to the previously staged suggestion of the same delete hash. */
    public class Node {
        public String suggestion;
        /** Index in {@code nodes} of the next node of the same delete hash, or a negative value at the end. */
        public int next;
        public Node(String suggestion, int next) {
            this.suggestion = suggestion;
            this.next = next;
        }
    }

    /** Per-delete-hash bookkeeping: how many suggestions were staged and where their chain starts. */
    public class Entry {
        /** Number of suggestions staged for the delete hash. */
        public int count;
        /** Index in {@code nodes} of the most recently staged suggestion, or -1 if none. */
        public int first;
        Entry(int count, int first) {
            this.count = count;
            this.first = first;
        }
    }

    /** Staged entries keyed by delete hash. */
    public Map<Integer, Entry> deletes;
    /** Storage for all staged suggestion nodes; entries refer to it by index. */
    public ChunkArray<Node> nodes;

    /**
     * Creates a new instance of SuggestionStage.
     *
     * <p>Specifying an accurate initialCapacity is not essential, but it can help speed up
     * processing by alleviating the need for data restructuring as the size grows.
     *
     * @param initialCapacity the expected number of words that will be added
     */
    SuggestionStage(int initialCapacity) {
        deletes = new HashMap<Integer, Entry>(initialCapacity);
        nodes = new ChunkArray<Node>(initialCapacity * 2);
    }

    /** Gets the count of unique delete words. */
    public int deleteCount() { return deletes.size(); }

    /** Gets the total count of all suggestions for all deletes. */
    public int nodeCount() { return nodes.count; }

    /** Clears all the data from the stage. */
    public void clear() {
        deletes.clear();
        nodes.clear();
    }

    /**
     * Stages a suggestion for a delete hash.
     *
     * @param deleteHash hash of the delete the suggestion belongs to
     * @param suggestion the suggested word
     */
    void add(int deleteHash, String suggestion) {
        Entry entry = deletes.get(deleteHash);
        if (entry == null) {
            // First suggestion for this hash: only now is a new entry needed.
            entry = new Entry(0, -1);
            deletes.put(deleteHash, entry);
        }
        // The new node becomes the head of the chain and points at the previous head.
        int next = entry.first;
        entry.count++;
        entry.first = nodes.count;
        nodes.add(new Node(suggestion, next));
    }

    /**
     * Copies all staged suggestions into {@code permanentDeletes}.
     *
     * <p>For a hash that already has suggestions the array is extended and the staged
     * suggestions are appended; otherwise a new array is created. Within one hash the
     * staged suggestions are written most recently added first. The stage itself is not
     * modified.
     *
     * @param permanentDeletes target map from delete hash to suggestion array
     */
    void commitTo(Map<Integer, String[]> permanentDeletes) {
        for (Map.Entry<Integer, Entry> staged : deletes.entrySet()) {
            Integer key = staged.getKey();
            Entry value = staged.getValue();

            String[] existing = permanentDeletes.get(key);
            int i = (existing == null) ? 0 : existing.length;
            String[] suggestions = (existing == null)
                    ? new String[value.count]
                    : Arrays.copyOf(existing, i + value.count);
            permanentDeletes.put(key, suggestions);

            // Walk the chain of staged nodes for this hash and append each suggestion.
            int next = value.first;
            while (next >= 0) {
                Node node = nodes.getValues(next);
                suggestions[i] = node.suggestion;
                next = node.next;
                i++;
            }
        }
    }
}
\ No newline at end of file
package SymSpell;// MIT License
//
// Copyright (c) 2018 Hampus Londögård
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
/**
 * Interactive console demo: loads a frequency dictionary and, for every line typed
 * on standard input, prints the single-word suggestions and the compound suggestion.
 */
public class SymSpellDemo {
    /** Column of the term in the dictionary file. */
    private final int termIndex = 0;
    /** Column of the frequency count in the dictionary file. */
    private final int countIndex = 1;
    private final String path ="data/frequency_dictionary_en_82_765.txt";
    private final SymSpell.Verbosity suggestionVerbosity = SymSpell.Verbosity.All; //Top, Closest, All
    private int maxEditDistanceLookup; //max edit distance per lookup (maxEditDistanceLookup<=maxEditDistanceDictionary)
    private SymSpell symSpell;

    /**
     * Builds the spell checker and loads the dictionary file.
     *
     * @param maxEditDistanceLookup maximum edit distance used for the dictionary and for lookups
     * @throws FileNotFoundException if the dictionary could not be loaded
     */
    private SymSpellDemo(int maxEditDistanceLookup) throws FileNotFoundException {
        symSpell = new SymSpell(-1, maxEditDistanceLookup, -1, 10);//, (byte)18);
        this.maxEditDistanceLookup = maxEditDistanceLookup;
        if (!symSpell.loadDictionary(path, termIndex, countIndex)) {
            throw new FileNotFoundException("File not found");
        }
    }

    /** Looks up suggestions for a single word with the configured verbosity and distance. */
    private List<SuggestItem> lookup(String input){
        return symSpell.lookup(input, suggestionVerbosity, maxEditDistanceLookup);
    }

    /** Returns the first compound suggestion for a multi-word input. */
    private SuggestItem lookupCompound(String input){
        return symSpell.lookupCompound(input, maxEditDistanceLookup).get(0);
    }

    /**
     * Reads terms from standard input until end of input and prints their suggestions.
     *
     * @param args unused
     * @throws IOException if the dictionary cannot be loaded or standard input cannot be read
     */
    public static void main(String[] args) throws IOException {
        SymSpellDemo symSpell = new SymSpellDemo(3);
        //verbosity=Top: the suggestion with the highest term frequency of the suggestions of smallest edit distance found
        //verbosity=Closest: all suggestions of smallest edit distance found, the suggestions are ordered by term frequency
        //verbosity=All: all suggestions <= maxEditDistance, the suggestions are ordered by edit distance, then by term frequency (slower, no early termination)
        // IE All is the only one to give suggestions if a word with exact match is found.
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        while (true) {
            System.out.println("Enter input:");
            String inputTerm = br.readLine();
            if (inputTerm == null) {
                // End of input (closed pipe / Ctrl-D): stop instead of looking up null forever.
                break;
            }
            long beg = System.currentTimeMillis();
            List<SuggestItem> suggestions = symSpell.lookup(inputTerm);
            SuggestItem compound = symSpell.lookupCompound(inputTerm);
            long end = System.currentTimeMillis();
            // Elapsed milliseconds for both lookups together.
            System.out.println("one word:"+(end - beg));
            for (SuggestItem suggestion : suggestions) {
                System.out.println("Lookup suggestion: " + suggestion.term + " " + suggestion.distance + " " + suggestion.count);
            }
            System.out.println("LookupCompound: " + compound.term);
        }
    }
}
package SymSpell;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.brilliance.word.blacklist.Utils;
import com.brilliance.word.blacklist.interf.IQuery;
import com.brilliance.word.blacklist.interf.IWordStore;
import com.brilliance.word.blacklist.search.HitSuggest;
/**
 * Word store and fuzzy query backed by a SymSpell dictionary.
 *
 * <p>Words are first staged via {@link #put(String, String)} and become searchable
 * after {@link #commitStaged()}.
 */
public class SymSpellQuery implements IQuery,IWordStore{
    SymSpell symSpell ;
    /** word -> { Set&lt;String&gt; of terms the word belongs to, Integer id of the word }. */
    Map<String,Object[]> wordsMap = new HashMap<String,Object[]>(2000);
    /** Words added since the last commit. */
    SuggestionStage suggstage = new SuggestionStage(16384);
    int maxEditDistance = 3;

    /**
     * @param maxEditDistance maximum edit distance used for the dictionary and for queries
     */
    public SymSpellQuery(int maxEditDistance)
    {
        symSpell = new SymSpell(-1, maxEditDistance, -1, 10);
        this.maxEditDistance = maxEditDistance;
    }

    /**
     * Registers that {@code word} occurs in {@code term}.
     *
     * <p>A word seen for the first time gets the next value of {@code Utils.WORD_COUNTER}
     * as its id and is staged as a dictionary entry with that id passed as its count.
     *
     * @param word the word to register
     * @param term identifier of the term the word belongs to
     * @return the id of the word
     */
    @Override
    public int put(String word, String term) {
        Object[] arr = wordsMap.get(word);
        if(arr != null)
        {
            // arr[0] is only ever assigned a Set<String> by this method.
            @SuppressWarnings("unchecked")
            Set<String> known = (Set<String>) arr[0];
            known.add(term);
            return (Integer) arr[1];
        }
        Set<String> words = new HashSet<String>();
        words.add(term);
        int cnt = Utils.WORD_COUNTER ++ ;
        wordsMap.put(word,new Object[]{ words,cnt});
        symSpell.createDictionaryEntry(word, cnt, suggstage);
        return cnt;
    }

    /**
     * Commits all staged words to the dictionary and empties the stage.
     */
    public void commitStaged()
    {
        if (symSpell.deletes == null)
            symSpell.deletes = new HashMap<Integer,String[]>(suggstage.deleteCount());
        symSpell.commitStaged(suggstage);
        // Empty the stage so a later commit does not add the same entries again.
        // NOTE(review): assumes SymSpell.commitStaged only copies data out of the stage - confirm.
        suggstage.clear();
    }

    /**
     * Looks up all suggestions for {@code word} within the configured edit distance.
     *
     * @param word the word to search for
     * @return the suggestions together with the terms each suggested word belongs to
     */
    @Override
    public HitSuggest query(String word) {
        HitSuggest sugg = new HitSuggest();
        List<SuggestItem> rs = symSpell.lookup(word, SymSpell.Verbosity.All, this.maxEditDistance);
        for(SuggestItem item :rs)
        {
            sugg.addSuggest(item.term, item.distance,(int)item.count);
            Object arr[] = this.wordsMap.get(item.term);
            if(arr != null)
            {
                // arr[0] is only ever assigned a Set<String> by put().
                @SuppressWarnings("unchecked")
                Set<String> owners = (Set<String>) arr[0];
                sugg.addBelong(owners);
            }
        }
        return sugg;
    }
}
package com.brilliance.word.blacklist;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
 * In-memory blacklist dictionary mapping a blacklist term to its array of integers.
 */
public class BlackListDic {
    private Map<String,Integer[]> blackWords = new HashMap<String,Integer[]>();

    /**
     * Adds a blacklist term, replacing any value stored for the same term.
     *
     * @param term     the blacklist term to add
     * @param integers the values to associate with the term
     */
    public void addBlackTerm(String term,Integer[] integers){
        this.blackWords.put(term, integers);
    }

    /**
     * Removes a blacklist term.
     *
     * @param term the blacklist term to remove
     */
    public void removeBlackTerm(String term)
    {
        this.blackWords.remove(term);
    }

    /**
     * Returns an iterator over all stored entries.
     *
     * @return iterator over the term/value entries
     */
    public Iterator<Map.Entry<String,Integer[]>> iterator()
    {
        final Set<Map.Entry<String,Integer[]>> entries = this.blackWords.entrySet();
        return entries.iterator();
    }

    /**
     * Returns the values stored for a term.
     *
     * @param term the blacklist term to look up
     * @return the stored array, or {@code null} if the term is unknown
     */
    public Integer[] getKeyWords(String term)
    {
        final Integer[] stored = this.blackWords.get(term);
        return stored;
    }
}
package com.brilliance.word.blacklist;
import java.io.BufferedReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import com.brilliance.word.blacklist.extend.simple.SimpleWordWalker;
import com.brilliance.word.blacklist.interf.IWordListener;
import com.brilliance.word.blacklist.search.BlackSearh;
import com.brilliance.word.blacklist.search.Result;
import SymSpell.SymSpellQuery;
/**
 * Blacklist matcher that indexes the words of every blacklist entry in a
 * {@link SymSpellQuery} and searches free text against them.
 */
public class BlackListShot {
    /** Loading stops once more than this many lines have been processed. */
    private static final long LINE_LIMIT = 500000;

    private SymSpellQuery ssquery = null;
    private BlackListDic blklist = null;

    public BlackListShot(){
        ssquery = new SymSpellQuery(Utils.MAX_ENABLE_DISTANCE);
        blklist = new BlackListDic();
    }

    /**
     * Loads blacklist entries, one per line, and commits their words to the index.
     *
     * <p>A line has the form {@code id|text}; everything before the first {@code '|'} is
     * the id. The text is split into words, each word is registered under the id, and the
     * entry is stored in the blacklist as an array whose first element is the summed length
     * of its words and whose remaining elements are the ids of those words.
     *
     * <p>Any exception while reading is printed and ends the loading; words read so far are
     * still committed. The reader is not closed.
     *
     * <p>NOTE(review): a line without a leading id (no {@code '|'} after the first character)
     * is registered under a {@code null} id, so such lines overwrite each other in the
     * blacklist - confirm whether that is intended.
     *
     * @param reader source of the blacklist lines
     */
    public void loadWords(Reader reader)
    {
        BufferedReader br = new BufferedReader(reader);
        String line = null;
        try{
            long count = 0;
            while((line = br.readLine())!=null)
            {
                // words[0] = total length of all words of the line, words[1..] = word ids
                final List<Integer> words = new ArrayList<Integer>();
                SimpleWordWalker sww = new SimpleWordWalker();
                int idIndex = line.indexOf('|');
                final String ID;
                final String text;
                if(idIndex > 0)
                {
                    ID = line.substring(0, idIndex);
                    text = line.substring(idIndex+1); // everything after the first separator
                }
                else
                {
                    ID = null;
                    text = line;
                }
                sww.walk(new StringReader(text),new IWordListener(){
                    @Override
                    public void onWord(String word, long begPos, long endPos) {
                        int cnt = ssquery.put(word, ID);
                        if(words.size() == 0)
                            words.add(word.length());
                        else
                            words.set(0,words.get(0) + word.length());
                        words.add(cnt);
                    }
                });
                blklist.addBlackTerm(ID, words.toArray(new Integer[0]));
                count++;
                if(count > LINE_LIMIT)
                    break;
            }
        }catch(Exception e){
            e.printStackTrace();
        }
        ssquery.commitStaged();
    }

    /**
     * Searches {@code term} against the loaded blacklist.
     *
     * <p>Results are collected into {@code rs}, which is then sorted by
     * {@code shotdegree} in descending order.
     *
     * @param term      the text to check
     * @param threshold threshold handed to the search
     * @param rs        list that receives the results
     * @return the value returned by the search; the original author documents it as the
     *         maximum hit rate
     */
    public double shot(String term,double threshold,List<Result> rs){
        BlackSearh bs = new BlackSearh(term,this.ssquery,threshold,Utils.MAX_BREAK_TERM);
        // The search receives a synchronized view of the caller's list.
        List<Result> rs2 = Collections.synchronizedList(rs);
        double degree = bs.shot(blklist,rs2);
        Collections.sort(rs, new Comparator<Result>(){
            @Override
            public int compare(Result o1, Result o2) {
                // Descending order; Double.compare keeps the ordering consistent even for NaN.
                return Double.compare(o2.shotdegree, o1.shotdegree);
            }});
        return degree;
    }
}
package com.brilliance.word.blacklist;
import java.io.BufferedReader;
import java.io.Reader;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import com.brilliance.word.blacklist.algorithm.DamerauEditDistance;
import com.brilliance.word.blacklist.algorithm.Distance2;
import com.brilliance.word.blacklist.extend.simple.SimpleBKTreeMaker;
import com.brilliance.word.blacklist.interf.IMetricSpace;
import com.brilliance.word.blacklist.interf.IWordBKTreeMaker;
import com.brilliance.word.blacklist.search.BlackSearh;
import com.brilliance.word.blacklist.search.Result;
import com.brilliance.word.blacklist.struct.BKTree;
/**
 * Blacklist matcher that indexes blacklist words in a {@link BKTree}.
 */
public class BlackListShot0 {
    /** Maximum edit distance the BK-tree is built for. */
    public int getMaxEnableDistance(){
        return Utils.MAX_ENABLE_DISTANCE;
    }

    /** Creates the metric used to measure the distance between words. */
    public IMetricSpace generMetricSpace()
    {
        return new Distance2();
        // return new DamerauEditDistance();
    }

    private BKTree bktree = null;
    private BlackListDic blklist = null;

    public BlackListShot0(){
        bktree = new BKTree(
                generMetricSpace(),
                getMaxEnableDistance());
        blklist = new BlackListDic();
    }

    /**
     * Loads blacklist entries, one per line, into the BK-tree.
     *
     * <p>Only the tree is filled; the blacklist dictionary is not populated by this
     * method. Any exception while reading is printed and ends the loading. The reader
     * is not closed.
     *
     * @param reader source of the blacklist lines
     */
    public void loadWords(Reader reader)
    {
        IWordBKTreeMaker wbkt = new SimpleBKTreeMaker();
        BufferedReader br = new BufferedReader(reader);
        String line = null;
        try{
            while((line = br.readLine())!=null)
            {
                // Load the words of one blacklist line into the tree; the returned key words are not used.
                wbkt.loadWords(line, bktree);
            }
        }catch(Exception e){
            e.printStackTrace();
        }
    }

    /**
     * Searches {@code term} against the loaded blacklist.
     *
     * <p>Results are collected into {@code rs}, which is then sorted by
     * {@code shotdegree} in descending order.
     *
     * @param term      the text to check
     * @param threshold threshold handed to the search
     * @param rs        list that receives the results
     * @return the value returned by the search; the original author documents it as the
     *         maximum hit rate
     */
    public double shot(String term,double threshold,List<Result> rs){
        BlackSearh bs = new BlackSearh(term,bktree,threshold,Utils.MAX_BREAK_TERM);
        // The search receives a synchronized view of the caller's list.
        List<Result> rs2 = Collections.synchronizedList(rs);
        double degree = bs.shot(blklist,rs2);
        Collections.sort(rs, new Comparator<Result>(){
            @Override
            public int compare(Result o1, Result o2) {
                // Descending order; Double.compare keeps the ordering consistent even for NaN.
                return Double.compare(o2.shotdegree, o1.shotdegree);
            }});
        return degree;
    }
}
\ No newline at end of file
package com.brilliance.word.blacklist;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import com.brilliance.word.blacklist.search.Result;
/**
 * Manual smoke test for {@link BlackListShot0}: loads the blacklist from
 * {@code data/id-name.txt}, searches one sample term and prints timings and
 * up to 20 results.
 */
public class TestMain {
    /**
     * @param args unused
     * @throws FileNotFoundException if the blacklist file does not exist
     * @throws java.io.IOException   if closing the blacklist file fails
     */
    public static void main(String[] args) throws FileNotFoundException, java.io.IOException {
        String term = "KIA ISa FU ";
        System.out.println(term.length());
        File path = Paths.get("data/id-name.txt").toFile();
        List<Result> rs = new ArrayList<Result>();
        BlackListShot0 bs = new BlackListShot0();

        // Load the blacklist words; the reader is closed as soon as loading is done.
        long timebeg = System.currentTimeMillis();
        try (FileReader reader = new FileReader(path)) {
            bs.loadWords(reader);
        }
        long timeend1 = System.currentTimeMillis();
        System.out.println("load word:"+(timeend1-timebeg));

        // Query the blacklist.
        double percent = bs.shot(term,80, rs);
        long timeend2 = System.currentTimeMillis();
        System.out.println("shot:"+(timeend2-timeend1));
        System.out.println(percent);

        // Print at most the first 20 results.
        if(rs.size() > 20)
            rs = rs.subList(0, 20);
        for(Result rsitem :rs)
        {
            System.out.println(rsitem);
        }
    }
}
package com.brilliance.word.blacklist;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import com.brilliance.word.blacklist.search.Result;
/**
 * Manual smoke test for {@link BlackListShot}: loads the blacklist from
 * {@code data/id-name.txt}, searches one sample text and prints timings and
 * up to 20 results.
 */
public class TestMain2 {
    /**
     * @param args unused
     * @throws FileNotFoundException if the blacklist file does not exist
     * @throws java.io.IOException   if closing the blacklist file fails
     */
    public static void main(String[] args) throws FileNotFoundException, java.io.IOException {
        String term = "Jean-Loris Bokassa is a good boy Jean-Loris Bokassa hahahhahahaha adaad adadad ada dada Jean-Loris Bokassa";
        System.out.println(term.length());
        File path = Paths.get("data/id-name.txt").toFile();
        List<Result> rs = new ArrayList<Result>();
        BlackListShot bs = new BlackListShot();

        // Load the blacklist words; the reader is closed as soon as loading is done.
        long timebeg = System.currentTimeMillis();
        try (FileReader reader = new FileReader(path)) {
            bs.loadWords(reader);
        }
        long timeend1 = System.currentTimeMillis();
        System.out.println("load word:"+(timeend1-timebeg));

        // Query the blacklist.
        double percent = bs.shot(term,80, rs);
        long timeend2 = System.currentTimeMillis();
        System.out.println("shot:"+(timeend2-timeend1));
        System.out.println(percent);

        // Print at most the first 20 results.
        if(rs.size() > 20)
            rs = rs.subList(0, 20);
        for(Result rsitem :rs)
        {
            System.out.println(rsitem);
        }
    }
}
package com.brilliance.word.blacklist;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class Utils {

    /** Maximum number of non-blacklist words allowed to interrupt a run of hit words. */
    public static int MAX_BREAK_TERM = 1;

    /** Largest edit distance the BK-tree lookup is allowed to accept. */
    public static int MAX_ENABLE_DISTANCE = 2;

    /** Offset between a lower-case ASCII letter and its upper-case counterpart. */
    public static final char chdis = 'a' - 'A';

    /** Stop words; the word walker does not report them. */
    public static final Set<String> stopWords = new HashSet<String>();

    /** Characters that are ignored while scanning; sorted below for binary search. */
    public static final char[] skipChars = {'%', '|', '#'};

    /** Characters that end the current word; sorted below for binary search. */
    public static final char[] breakChars = {'.', '\t', '\r', '\n', ';', ' '};

    static {
        // Both tables are looked up with Arrays.binarySearch, which needs sorted input.
        Arrays.sort(skipChars);
        Arrays.sort(breakChars);
        for (String stopWord : "the,of,at,a,an,another,there,here,while,when,is,am,are,&".split(",")) {
            stopWords.add(stopWord);
        }
    }

    /** Shared thread pool used to score word groups in the background. */
    public static ExecutorService executor = Executors.newCachedThreadPool();

    // NOTE(review): no use of the two settings below is visible in this part of the
    // code base; their meaning should be confirmed against their callers.
    public static int WORD_COUNTER = 11;

    public static final boolean WHOLD_WORD = true;
}
package com.brilliance.word.blacklist.algorithm;
import com.brilliance.word.blacklist.interf.IMetricSpace;
/**
 * {@link IMetricSpace} backed by {@link EditDistance}, which computes a
 * Damerau-Levenshtein style edit distance.
 */
public class DamerauEditDistance implements IMetricSpace {

    public DamerauEditDistance() {
    }

    /**
     * Unbounded distance between {@code a} and {@code b}; a negative limit tells
     * the underlying algorithm not to cut off.
     */
    @Override
    public double distance(String a, String b) {
        return distance(a, b, -1);
    }

    /**
     * Distance between {@code a} and {@code b}, bounded by {@code maxDistance}.
     *
     * @return the edit distance, or -1 when it exceeds {@code maxDistance}
     */
    @Override
    public double distance(String a, String b, int maxDistance) {
        return new EditDistance(a).compare(b, maxDistance);
    }
}
package com.brilliance.word.blacklist.algorithm;
import java.util.ArrayList;
import java.util.List;
import com.brilliance.word.blacklist.interf.IMetricSpace;
/**
* Huge list of different algos, in Java:
* http://www.dcs.shef.ac.uk/~sam/stringmetrics.html
*
* Simplified "similarity", even more logical matching, very fast:
* http://www.catalysoft.com/articles/StrikeAMatch.html
*
* @author Fuad Efendi
*
*/
public class Distance2 implements IMetricSpace {

    public Distance2() {
    }

    /**
     * Levenshtein distance between two objects that must both be strings.
     *
     * @throws ClassCastException if either argument is not a {@code String}
     */
    public int getDistance(Object object1, Object object2) {
        String string1 = (String) object1;
        String string2 = (String) object2;
        // Resolves to the (String, String) overload.
        return getDistance(string1, string2);
    }

    /**
     * Computes the Levenshtein distance with two rolling rows instead of a full
     * matrix (derived from the Lucene 3.0 contrib implementation, see
     * org.apache.commons.lang.StringUtils#getLevenshteinDistance).
     *
     * <p>Only the previous row {@code p} and the current row {@code d} are kept;
     * after each character of {@code other} the two are swapped, so memory stays
     * proportional to {@code target.length()}.
     *
     * @param target the first string
     * @param other  the second string
     * @return the number of single-character insertions, deletions and
     *         substitutions needed to turn one string into the other
     */
    public int getDistance(String target, String other) {
        final char[] sa = target.toCharArray();
        final int n = sa.length;
        final int m = other.length();

        // Turning an empty string into the other one takes one insertion per
        // character. (The Lucene code this was taken from returned a similarity
        // here - 1 for "both empty", 0 otherwise - which is wrong for a distance.)
        if (n == 0 || m == 0) {
            return Math.max(n, m);
        }

        int[] p = new int[n + 1]; // 'previous' cost row
        int[] d = new int[n + 1]; // 'current' cost row
        int[] swap;               // placeholder to assist in swapping p and d

        for (int i = 0; i <= n; i++) {
            p[i] = i;
        }

        for (int j = 1; j <= m; j++) {
            char otherChar = other.charAt(j - 1);
            d[0] = j;
            for (int i = 1; i <= n; i++) {
                int cost = sa[i - 1] == otherChar ? 0 : 1;
                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
            }
            // the current row becomes the previous row of the next round
            swap = p;
            p = d;
            d = swap;
        }

        // the last action in the loop was to switch d and p, so p holds the newest costs
        return p[n];
    }

    /**
     * http://www.catalysoft.com/articles/StrikeAMatch.html
     *
     * @return the adjacent letter pairs contained in the input string, in order
     */
    private static List<String> letterPairs(String str) {
        List<String> pairs = new ArrayList<String>();
        for (int i = 0; i < str.length() - 1; i++) {
            pairs.add(str.substring(i, i + 2));
        }
        return pairs;
    }

    /**
     * Letter-pair comparison based on
     * http://www.catalysoft.com/articles/StrikeAMatch.html
     *
     * <p>Unlike the article's similarity in [0,1], this returns an integer:
     * {@code union - (intersection + 1) * 2}, where {@code union} is the total
     * number of letter pairs of both strings and {@code intersection} the number
     * of pairs they share. Smaller values mean more shared pairs.
     */
    public static int compareStrings(String str1, String str2) {
        List<String> pairs1 = letterPairs(str1);
        List<String> pairs2 = letterPairs(str2);
        int intersection = 0;
        int union = pairs1.size() + pairs2.size();
        for (int i = 0; i < pairs1.size(); i++) {
            for (int j = 0; j < pairs2.size(); j++) {
                if (pairs1.get(i).equals(pairs2.get(j))) {
                    intersection++;
                    // each pair of the second string may be matched only once
                    pairs2.remove(j);
                    break;
                }
            }
        }
        return union - (intersection + 1) * 2;
    }

    @Override
    public double distance(String a, String b) {
        return getDistance(a, b);
    }

    /** Same as {@link #distance(String, String)}; {@code maxDistance} is ignored. */
    @Override
    public double distance(String a, String b, int maxDistance) {
        return getDistance(a, b);
    }
}
\ No newline at end of file
package com.brilliance.word.blacklist.algorithm;// MIT License
//
// Copyright (c) 2018 Hampus Londögård
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
/**
 * Compares one fixed base string against other strings using an edit distance
 * that counts insertions, deletions, substitutions and transpositions of
 * adjacent characters.
 *
 * <p>Instances keep scratch arrays between calls, so one instance must not be
 * used from several threads at the same time.
 */
public class EditDistance {

    /** Supported distance algorithms. */
    public enum DistanceAlgorithm {
        Damerau
    }

    /** The base string, or {@code null} when it was null or empty. */
    private String baseString;
    private DistanceAlgorithm algorithm;
    /** Cost row of the current iteration. */
    private int[] v0;
    /** Costs from one level further back (offset by +1 position), for transpositions. */
    private int[] v2;

    /**
     * Creates a comparer that uses the Damerau algorithm.
     *
     * @param baseString the base string to which other strings will be compared
     */
    EditDistance(String baseString) {
        this(baseString, DistanceAlgorithm.Damerau);
    }

    /**
     * Creates a new comparer.
     *
     * @param baseString the base string to which other strings will be compared;
     *                   {@code null} is treated like the empty string
     * @param algorithm  the desired edit distance algorithm
     */
    EditDistance(String baseString, DistanceAlgorithm algorithm) {
        this.algorithm = algorithm;
        if (baseString == null || baseString.isEmpty()) {
            // An empty base needs no scratch space; the distance is the other length.
            this.baseString = null;
            return;
        }
        this.baseString = baseString;
        if (algorithm == DistanceAlgorithm.Damerau) {
            v0 = new int[baseString.length()];
            v2 = new int[baseString.length()];
        }
    }

    /**
     * Compares a string to the base string using the selected algorithm.
     *
     * @param string2     the string to compare
     * @param maxDistance the maximum distance of interest; negative for no limit
     * @return the edit distance, or -1 if {@code maxDistance} is exceeded
     * @throws IllegalArgumentException if the algorithm is not supported
     */
    public int compare(String string2, int maxDistance) {
        switch (algorithm) {
            case Damerau: return DamerauLevenshteinDistance(string2, maxDistance);
        }
        throw new IllegalArgumentException("unknown DistanceAlgorithm");
    }

    /**
     * Computes the edit distance between the base string and {@code string2}.
     *
     * <p>When either string is null or empty the length of the other one is
     * returned directly, without applying {@code maxDistance}.
     *
     * @param string2     the string being compared against the base string
     * @param maxDistance the maximum edit distance of interest; negative for no limit
     * @return the number of edits (>= 0) required to transform one string into the
     *         other, or -1 if the distance is greater than {@code maxDistance}
     */
    public int DamerauLevenshteinDistance(String string2, int maxDistance) {
        if (baseString == null) return string2 == null ? 0 : string2.length();
        if (string2 == null || string2.isEmpty()) return baseString.length();

        // If the strings differ in length, keep the shorter one in string1 so that
        // most of the time is spent in the inner loop.
        String string1;
        if (baseString.length() > string2.length()) {
            string1 = string2;
            string2 = baseString;
        } else {
            string1 = baseString;
        }
        int sLen = string1.length(); // this is also the minimum length of the two strings
        int tLen = string2.length();

        // a suffix common to both strings can be ignored
        while ((sLen > 0) && (string1.charAt(sLen - 1) == string2.charAt(tLen - 1))) { sLen--; tLen--; }

        int start = 0;
        if ((string1.charAt(0) == string2.charAt(0)) || (sLen == 0)) {
            // a prefix common to both strings can be ignored as well
            while ((start < sLen) && (string1.charAt(start) == string2.charAt(start))) start++;
            sLen -= start; // length of the part excluding common prefix and suffix
            tLen -= start;

            // if all of the shorter string matches prefix and/or suffix of the longer
            // one, the distance is just the number of extra characters
            if (sLen == 0) return tLen;

            string2 = string2.substring(start, start + tLen); // avoids start+j in the inner loop
        }
        int lenDiff = tLen - sLen;
        if ((maxDistance < 0) || (maxDistance > tLen)) {
            maxDistance = tLen;
        } else if (lenDiff > maxDistance) return -1;

        if (tLen > v0.length) {
            // fresh arrays are already zeroed
            v0 = new int[tLen];
            v2 = new int[tLen];
        } else {
            for (int i = 0; i < tLen; i++) v2[i] = 0;
        }
        int j;
        for (j = 0; j < maxDistance; j++) v0[j] = j + 1;
        for (; j < tLen; j++) v0[j] = maxDistance + 1;

        int jStartOffset = maxDistance - (tLen - sLen);
        boolean haveMax = maxDistance < tLen;
        int jStart = 0;
        int jEnd = maxDistance;
        char sChar = string1.charAt(0);
        int current = 0;
        for (int i = 0; i < sLen; i++) {
            char prevsChar = sChar;
            sChar = string1.charAt(start + i);
            char tChar = string2.charAt(0);
            int left = i;
            current = left + 1;
            int nextTransCost = 0;
            // no need to look beyond the window of the lower right diagonal minus
            // maxDistance cells (lower right diagonal is i - lenDiff) and the upper
            // left diagonal plus maxDistance cells (upper left is i)
            jStart += (i > jStartOffset) ? 1 : 0;
            jEnd += (jEnd < tLen) ? 1 : 0;
            for (j = jStart; j < jEnd; j++) {
                int above = current;
                int thisTransCost = nextTransCost;
                nextTransCost = v2[j];
                v2[j] = current = left; // cost of diagonal (substitution)
                left = v0[j];           // left now equals current cost (diagonal at next iteration)
                char prevtChar = tChar;
                tChar = string2.charAt(j);
                if (sChar != tChar) {
                    if (left < current) current = left;   // insertion
                    if (above < current) current = above; // deletion
                    current++;
                    if ((i != 0) && (j != 0)
                            && (sChar == prevtChar)
                            && (prevsChar == tChar)) {
                        thisTransCost++;
                        if (thisTransCost < current) current = thisTransCost; // transposition
                    }
                }
                v0[j] = current;
            }
            if (haveMax && (v0[i + lenDiff] > maxDistance)) return -1;
        }
        return (current <= maxDistance) ? current : -1;
    }
}
package com.brilliance.word.blacklist.extend.simple;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import com.brilliance.word.blacklist.interf.IWordBKTreeMaker;
import com.brilliance.word.blacklist.interf.IWordListener;
import com.brilliance.word.blacklist.struct.BKTree;
/**
 * Splits one blacklist entry into words with {@link SimpleWordWalker} and stores
 * every word in a BK-tree.
 */
public class SimpleBKTreeMaker implements IWordBKTreeMaker {

    /**
     * Indexes the words of {@code term}.
     *
     * @param term   a single blacklist entry
     * @param bktree the tree that receives each word together with {@code term}
     * @return the words reported by the walker, in the order they were reported
     */
    @Override
    public List<String> loadWords(final String term, final BKTree bktree) {
        final List<String> collected = new ArrayList<String>();
        IWordListener indexer = new IWordListener() {
            @Override
            public void onWord(String word, long begPos, long endPos) {
                // Every word remembers the entry it came from.
                bktree.put(word, term);
                collected.add(word);
            }
        };
        new SimpleWordWalker().walk(new StringReader(term), indexer);
        return collected;
    }
}
package com.brilliance.word.blacklist.extend.simple;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import com.brilliance.word.blacklist.Utils;
import com.brilliance.word.blacklist.interf.IWordListener;
import com.brilliance.word.blacklist.interf.IWordWalker;
/**
 * Splits a character stream into lower-cased words and reports each word that is
 * not a stop word, together with its position, to an {@link IWordListener}.
 *
 * <p>The walker keeps its position in instance fields while a walk is running, so
 * one instance must not run several walks concurrently.
 */
public class SimpleWordWalker implements IWordWalker {

    /** Position at which the word currently being collected starts. */
    private long begPos = 0;
    /** Number of characters consumed since {@link #begPos}. */
    private long cnt = 0;

    /**
     * Reads {@code reader} to its end and reports every word found.
     *
     * <p>Exceptions (from the reader or the listener) end the walk early and are
     * printed, not rethrown.
     *
     * @param reader   source of the text
     * @param listener receives each word with its start and (exclusive) end position
     */
    @Override
    public void walk(Reader reader, IWordListener listener) {
        try {
            StringBuilder sb = new StringBuilder();
            int c;
            // Reader.read() signals end of stream with -1; any other value,
            // including 0, is a regular character.
            while ((c = reader.read()) != -1) {
                consumeChars(reader, listener, sb, (char) c);
            }
            if (sb.length() > 0) { // the input ended in the middle of a word
                long endPos = begPos + cnt;
                onWord(listener, sb, begPos, endPos);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always start the next walk from a clean state, even after a failure.
            begPos = 0;
            cnt = 0;
        }
    }

    /**
     * Handles one character: letters (lower-cased) and digits extend the current
     * word, skip characters are dropped, break characters end the word, and any
     * other character is appended unchanged.
     *
     * <p>A '.' is kept inside the word only for the pattern "no.&lt;digit&gt;";
     * otherwise it ends the word. Deciding that requires reading one character
     * ahead, which is then processed recursively.
     */
    void consumeChars(Reader reader, IWordListener listener, StringBuilder sb, char c) throws IOException {
        cnt++;
        if (c >= 'a' && c <= 'z') {
            sb.append(c);
            return;
        } else if (c >= 'A' && c <= 'Z') {
            sb.append((char) (c + Utils.chdis));
            return;
        } else if (c >= '0' && c <= '9') {
            sb.append(c);
            return;
        } else if (c == '.') {
            int next = reader.read();
            if (next == -1) {
                // The '.' was the last character of the input: it simply ends the
                // word. Without this check the end-of-stream marker would be cast
                // to a char and reported as a bogus one-character word.
                long endPos = begPos + cnt - 1;
                onWord(listener, sb, begPos, endPos);
                begPos = endPos + 1;
                cnt = 0;
                return;
            }
            char oc = (char) next;
            if (oc >= '0' && oc <= '9' && sb.toString().equals("no")) {
                sb.append(c);
                cnt++;
                sb.append(oc);
            } else {
                long endPos = begPos + cnt - 1;
                onWord(listener, sb, begPos, endPos);
                begPos = endPos + 1; // the next word starts after the '.'
                cnt = 0;
                // the look-ahead character still has to be processed
                consumeChars(reader, listener, sb, oc);
            }
            return;
        } else if (Arrays.binarySearch(Utils.skipChars, c) >= 0) { // ignored character
            if (sb.length() == 0) {
                begPos++; // outside a word the skipped character moves the start position
                cnt = 0;
            }
            return;
        } else if (Arrays.binarySearch(Utils.breakChars, c) >= 0) { // word break
            if (sb.length() > 0) { // a word has been collected
                long endPos = begPos + cnt - 1; // position of the break character
                onWord(listener, sb, begPos, endPos);
                begPos = endPos + 1; // the next word starts after the break character
                cnt = 0;
            } else {
                begPos++; // consecutive break characters
                cnt = 0;
            }
            return;
        } else {
            sb.append(c);
            return;
        }
    }

    /**
     * Reports the collected word unless it is empty or a stop word, then clears
     * the buffer.
     */
    void onWord(IWordListener listener, StringBuilder sb, long begPos, long endPos) {
        String word = sb.toString();
        if (word.length() > 0 && !Utils.stopWords.contains(word)) {
            listener.onWord(word, begPos, endPos);
        }
        sb.delete(0, sb.length());
    }
}
package com.brilliance.word.blacklist.interf;
/**
 * A distance function over strings.
 */
public interface IMetricSpace {

    /**
     * Calculates the distance between two strings.
     *
     * @param a the first string
     * @param b the second string
     * @return the distance between the two strings
     */
    double distance(String a, String b);

    /**
     * Calculates the distance between two strings with an upper bound of interest.
     * How the bound is applied is up to the implementation.
     *
     * @param a           the first string
     * @param b           the second string
     * @param maxDistance the largest distance the caller is interested in
     * @return the distance between the two strings
     */
    double distance(String a, String b, int maxDistance);
}
package com.brilliance.word.blacklist.interf;
import com.brilliance.word.blacklist.search.HitSuggest;
/**
 * Looks up the stored words that are close to a given word.
 */
public interface IQuery {

    /**
     * @param word the word to look up
     * @return the matches found for {@code word}
     */
    HitSuggest query(String word);
}
package com.brilliance.word.blacklist.interf;
import java.util.List;
import com.brilliance.word.blacklist.struct.BKTree;
/**
 * Breaks a blacklist entry into words and feeds them into a BK-tree.
 */
public interface IWordBKTreeMaker {

    /**
     * @param term  a single blacklist entry
     * @param btree the tree that receives the words of {@code term}
     * @return the words extracted from {@code term}
     */
    List<String> loadWords(String term, BKTree btree);
}
package com.brilliance.word.blacklist.interf;
/**
 * Callback that receives the words found while walking over a text.
 */
public interface IWordListener {

    /**
     * Called once for every reported word.
     *
     * @param term   the word that was found
     * @param begPos position at which the word starts
     * @param endPos position at which the word ends
     */
    void onWord(String term, long begPos, long endPos);
}
package com.brilliance.word.blacklist.interf;
/**
 * A store that remembers which blacklist entry a word came from.
 */
public interface IWordStore {

    /**
     * Stores {@code word} as belonging to {@code term}.
     *
     * @param word the word to store
     * @param term the entry the word belongs to
     * @return an implementation-defined status value
     */
    int put(String word, String term);
}
package com.brilliance.word.blacklist.interf;
import java.io.Reader;
/**
 * Walks over a text and reports the words it contains.
 */
public interface IWordWalker {

    /**
     * @param reader   source of the text to split into words
     * @param listener receives every word that is found
     */
    void walk(Reader reader, IWordListener listener);
}
package com.brilliance.word.blacklist.search;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.FutureTask;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import com.brilliance.word.blacklist.BlackListDic;
import com.brilliance.word.blacklist.Utils;
import com.brilliance.word.blacklist.extend.simple.SimpleWordWalker;
import com.brilliance.word.blacklist.interf.IQuery;
import com.brilliance.word.blacklist.interf.IWordListener;
/**
 * Scans one text for blacklist hits.
 *
 * <p>The text is split into words; every word is looked up through the
 * {@link IQuery}. Words with matches are collected into {@link HitWordGroup}s.
 * Once more than {@code maxBreakTerm} words without a match have been seen, the
 * current group is closed and scored on the shared executor; the next matching
 * word opens a new group.
 *
 * <p>An instance accumulates groups and is meant for a single {@link #shot} call.
 */
public class BlackSearh {
    /** The text being scanned. */
    String term;
    /** Word lookup. */
    IQuery query;
    /** Number of non-matching words tolerated before the current group is closed. */
    private int maxBreakTerm = 3;
    /**
     * Non-matching words seen since the current group was opened.
     * NOTE(review): this is not reset when a matching word is appended to the
     * current group, so it counts all breaks in the group, not only consecutive
     * ones - confirm that this is intended.
     */
    int breaks = 0;
    /** Groups found so far, in text order. */
    List<HitWordGroup> sglst = new ArrayList<HitWordGroup>();
    /** Minimum score handed to every group. */
    double threshold;

    public BlackSearh(String term, IQuery query, double threshold, int maxBreakWords) {
        this(term, query, threshold);
        this.maxBreakTerm = maxBreakWords;
    }

    public BlackSearh(String term, IQuery query, double threshold) {
        this.term = term;
        this.query = query;
        this.threshold = threshold;
    }

    /**
     * Runs the scan and returns the best score of all groups.
     *
     * @param blklist the blacklist dictionary used for scoring
     * @param rs      receives the reported matches; worker threads append to it, so
     *                it has to tolerate concurrent additions
     * @return the highest score reported by any group, 0 if there was no group
     */
    public double shot(final BlackListDic blklist, final List<Result> rs) {
        long timebeg = System.currentTimeMillis();
        final List<FutureTask<Double>> taskList = new ArrayList<FutureTask<Double>>();
        new SimpleWordWalker().walk(new StringReader(term), new IWordListener() {
            @Override
            public void onWord(final String word, final long begPos, final long endPos) {
                HitSuggest item = query.query(word);
                if (item.getItemSize() == 0) { // not a blacklist word
                    breaks++;
                    // Too many breaks: the newest group is complete. Score it in the
                    // background unless that has already been started (there is one
                    // task per closed group, so fewer tasks than groups means the
                    // newest group is still open).
                    if (breaks > maxBreakTerm && taskList.size() < sglst.size()) {
                        final HitWordGroup sgroup = sglst.get(sglst.size() - 1);
                        FutureTask<Double> task = new FutureTask<Double>(new Callable<Double>() {
                            @Override
                            public Double call() throws Exception {
                                return sgroup.shot(blklist, rs);
                            }
                        });
                        Utils.executor.submit(task);
                        taskList.add(task);
                    }
                    return;
                }

                HitWord hw = new HitWord(item, begPos, endPos);
                if (sglst.size() == 0 || breaks > maxBreakTerm) {
                    // first hit of the text, or the previous group has been closed
                    HitWordGroup sgroup = new HitWordGroup(term, threshold);
                    sgroup.addSuggest(hw);
                    breaks = 0;
                    sglst.add(sgroup);
                } else {
                    sglst.get(sglst.size() - 1).addSuggest(hw);
                }
            }
        });
        long timeend1 = System.currentTimeMillis();
        System.out.println("query word:" + (timeend1 - timebeg));

        // compute the hit rate
        long begtime = System.currentTimeMillis();
        double deg = 0;
        if (taskList.size() < sglst.size()) {
            // the last group was still open when the text ended: score it here
            HitWordGroup sgroup = sglst.get(sglst.size() - 1);
            deg = sgroup.shot(blklist, rs);
        }
        for (FutureTask<Double> task : taskList) {
            try {
                double ideg = task.get();
                if (deg < ideg) {
                    deg = ideg;
                }
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and stop waiting; with
                // the flag set, further get() calls on unfinished tasks would fail
                // immediately anyway.
                Thread.currentThread().interrupt();
                e.printStackTrace();
                break;
            } catch (ExecutionException e) {
                // a failed group is skipped; the other groups still count
                e.printStackTrace();
            }
        }
        long endtime = System.currentTimeMillis();
        System.out.println("one shot:" + (endtime - begtime));
        return deg;
    }
}
package com.brilliance.word.blacklist.search;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Stores the items a BK-tree query returned for one word: the matching words,
 * ordered by ascending distance, and the union of the entries they belong to.
 *
 * @author fukai
 */
public class HitSuggest {
    /** Matches ordered by ascending distance. */
    List<HitSuggestItem> suggest = new ArrayList<HitSuggestItem>();
    /** Entries that any of the matches belongs to. */
    Set<String> belong = new HashSet<String>();

    /**
     * Inserts an item at its sorted position, found by binary search on the
     * distance. The item is added exactly once.
     *
     * @param sugg the item to insert
     */
    public void addSuggest(HitSuggestItem sugg) {
        int low = 0;
        int high = suggest.size() - 1;
        while (low <= high) {
            int middle = (low + high) >>> 1;
            double middleDis = suggest.get(middle).getDis();
            if (sugg.getDis() == middleDis) {
                // Same distance: insert right here. Only the position is recorded;
                // the single insertion happens below.
                low = middle;
                break;
            } else if (sugg.getDis() > middleDis) {
                low = middle + 1;
            } else {
                high = middle - 1;
            }
        }
        suggest.add(low, sugg);
    }

    /**
     * Creates an item from its parts and inserts it at its sorted position.
     */
    public synchronized void addSuggest(String value, double dis, int count) {
        addSuggest(new HitSuggestItem(value, dis, count));
    }

    /** Adds all given entries to the set of owning entries. */
    public synchronized void addBelong(Set<String> belong) {
        this.belong.addAll(belong);
    }

    /** @return the number of stored matches */
    public synchronized int getItemSize() {
        return this.suggest.size();
    }
}
package com.brilliance.word.blacklist.search;
/**
 * One immutable match of a word lookup: the matched word, its distance to the
 * looked-up word and an integer tag supplied by whoever created the item.
 */
public class HitSuggestItem {
    private final String value;
    private final double dis;
    private final int count;

    /**
     * @param value the matched word
     * @param dis   distance between the matched word and the looked-up word
     * @param count integer tag stored with the match
     */
    public HitSuggestItem(String value, double dis, int count) {
        this.value = value;
        this.dis = dis;
        this.count = count;
    }

    /** @return the matched word */
    public String getValue() {
        return value;
    }

    /** @return the distance of this match */
    public double getDis() {
        return dis;
    }

    /** @return the integer tag stored with this match */
    public int getCount() {
        return count;
    }
}
package com.brilliance.word.blacklist.search;
/**
 * A word of the scanned text that produced lookup matches, together with the
 * position it was found at.
 */
public class HitWord {
    /** Flag that callers may set; it is not touched by this class. */
    public boolean isFit;
    /** Position at which the word starts. */
    public final long begPos;
    /** Position at which the word ends. */
    public final long endPos;
    /** Lookup matches for the word. */
    public final HitSuggest item;

    /**
     * @param item   the lookup matches for the word
     * @param begPos position at which the word starts
     * @param endPos position at which the word ends
     */
    public HitWord(HitSuggest item, long begPos, long endPos) {
        this.item = item;
        this.begPos = begPos;
        this.endPos = endPos;
    }
}
package com.brilliance.word.blacklist.search;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.FutureTask;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import com.brilliance.word.blacklist.BlackListDic;
import com.brilliance.word.blacklist.Utils;
/**
 * A group of hit words from the scanned text that is scored against the
 * blacklist terms those words belong to.
 *
 * For every candidate term a matrix "key word x hit word" is built; a memoised
 * recursive search then enumerates the ways key words can be assigned to distinct
 * hit words, and every assignment is turned into a score by scalc().
 */
public class HitWordGroup {
// Hit words of this group, in the order they were added.
List<HitWord> items = new ArrayList<HitWord>();
int maxBreakWords = 3;
// String blackWord;
// Text the hit word positions refer to; result snippets are cut out of it.
String source;
// Minimum score a candidate must reach to be reported.
double threshold;
// Number of memo hits in calcCount(); reset at the start of every shot().
int shotCount = 0;
// Memo for calcCount(), keyed by the string built in getCacheCount(); cleared per term.
Map<String,List<Object[]>> cacheMap = new HashMap<String,List<Object[]>>();
/**
 * @param source    the scanned text
 * @param threshold minimum score for reported results
 */
public HitWordGroup(String source,double threshold)
{
this.source = source;
this.threshold = threshold;
}
/** Wraps the arguments in a HitWord and appends it to the group. */
public void addSuggest(HitSuggest item,long begPos,long endPos)
{
items.add(new HitWord(item,begPos,endPos));
}
/** Appends a hit word to the group. */
public void addSuggest(HitWord hw)
{
items.add(hw);
}
/**
 * Scores this group against every blacklist term that enough of its words belong to.
 *
 * @param blklist dictionary that supplies the key words of a term
 * @param rs      receives a Result for every assignment that reaches the threshold
 * @return the best score found, or -1 when no term was scored
 */
public double shot(BlackListDic blklist,final List<Result> rs)
{
double degree = -1;
// score this group against the candidate blacklist terms
Map<String,Integer> termCount = new HashMap<String,Integer>(30000); // number of hit words per term
for(HitWord hw : items)
{
for(String bitem:hw.item.belong)
{
Integer intg = termCount.get(bitem);
if(intg == null)
termCount.put(bitem, 1);
else
termCount.put(bitem, intg + 1);
}
}
int cnt = 0;
this.shotCount = 0;
for(Map.Entry<String, Integer> entry:termCount.entrySet())
{
final String term = entry.getKey();
int count = entry.getValue();
final Integer[] keyWords = blklist.getKeyWords(term);
// skip terms for which too few hit words were seen, relative to the number of key words
if(count < (keyWords.length -1 )*(threshold /100.0))
continue;
double ditem = fuzzyDegree(term,keyWords,rs);// score for this single term
cnt++;
if(ditem > degree)
degree = ditem;
// a score of 99 or more is treated as good enough to stop
if(degree >= 99)
break;
}
System.out.println("one shot calc cnt:"+cnt);
System.out.println("----shot count-----:"+this.shotCount);
return degree;
}
/**
 * Scores the group against one blacklist term.
 *
 * @param blackWord the blacklist term
 * @param keyWords  keyWords[0] is used as the total key word length; the remaining
 *                  entries are compared with HitSuggestItem.getCount() of the hit words
 * @param rs        receives a Result for every assignment that reaches the threshold
 * @return the best score of all assignments, 0 when there is none, or -1 when the
 *         term is rejected by the early distance check
 */
public double fuzzyDegree(String blackWord,Integer[] keyWords,List<Result> rs)
{
// build the match matrix: a[i][j] is null when hit word j does not match key
// word i, otherwise {distance, length of the matched word}
int m = keyWords.length;
int n = items.size();
int[][][] a = new int[m-1][n][2];
int totalKeyWordLength = keyWords[0];
int allMinDis = 0;
int totCount =0;
for(int i=0;i<m-1;i++)
{
int kw = keyWords[i+1]; // offset by one because keyWords[0] holds the total length
//keyWordLength += kw.length();
int minDis = -1;
for(int j=0;j<n;j++)
{
HitWord bw = items.get(j);
a[i][j] = null;
for(int k=0;k < bw.item.suggest.size();k++)
{
HitSuggestItem sitem = bw.item.suggest.get(k);
if(kw == sitem.getCount())
{
// same word
a[i][j] = new int[]{(int)sitem.getDis(),sitem.getValue().length()};
// NOTE(review): despite its name, minDis ends up holding the largest
// distance seen for this key word - confirm that this is intended.
if(minDis < sitem.getDis())
{
if(minDis < 0)
totCount += sitem.getValue().length();
minDis = (int)sitem.getDis();
}
}
}
}
if(minDis > 0)
allMinDis += minDis;
}
if(100 - allMinDis * 100.0 / totCount < this.threshold ) // distances too large to reach the threshold
return -1;
this.cacheMap.clear();
Integer[] copyArr = Arrays.copyOfRange(keyWords, 1, keyWords.length);
//double percent = calc( blackWord,copyArr,a,0,n,new ArrayList<Integer>(),0,0,rs,keyWordLength);
List<Object[]> args = calcCount(blackWord,copyArr,a,0,n,new ArrayList<Integer>());
double percent = 0;
// NOTE(review): calcCount() returns null when nothing matches; reaching this loop
// with a null list would throw - confirm the early check above rules that out.
for(Object[] item : args)
{
int keyWordLength = (Integer)item[0];
int wordDis = (Integer)item[1];
List<Integer> path = (List<Integer>)item[2];
Integer[] cols = path.toArray(new Integer[0]);
Arrays.sort(cols);
int sum = 0; // word position gap
// NOTE(review): the loop stops at i == 2, so the gap between cols[0] and cols[1]
// is not added - confirm that this is intended.
for(int i=cols.length - 1;i>1;i--)
sum = sum+cols[i]-cols[i-1];
int termDis = sum + (a.length - cols.length);// term distance = position gap + number of unmatched key words
double sper = scalc(wordDis,termDis,keyWordLength,a.length,totalKeyWordLength);
if(sper>=threshold) // report only scores that reach the threshold
{
// from the start of the first matched hit word to the end of the last one
long begPos = (int)items.get(
cols[0]).begPos;
long endPos = (int)items.get(
cols[cols.length - 1]).endPos;
Result rsitem = new Result();
rsitem.begpos = begPos;
rsitem.endpos = endPos;
rsitem.black = blackWord;
rsitem.shotdegree = sper;
rsitem.desstr = this.source.substring((int)begPos, (int)endPos);
rs.add(rsitem);
}
if(sper > percent)
percent = sper;
}
return percent;
}
/**
 * Builds the memo key for calcCount() into sb and looks it up.
 *
 * The key lists, for every key word from index m on, the key word followed by the
 * hit word columns that match it and are not in lck (or -1 when there is none).
 *
 * @return the memoised result for that key, or null when there is none
 */
private List<Object[]> getCacheCount(Integer[] kw,int m,int n,int[][][] arr,List<Integer> lck,StringBuilder sb)
{
for(int i=m;i<kw.length;i++)
{
boolean hasWordFit = false;
sb.append(kw[i]);
sb.append(',');
for(int j=0;j<n;j++)
{
if(arr[i][j]==null)
continue;
if(lck.contains(j))
continue;
hasWordFit=true;
sb.append(j);
sb.append(',');
}
if(!hasWordFit)
sb.append(-1);
sb.append(';');
}
return cacheMap.get(sb.toString());
}
/**
 * Enumerates the assignments of key words (rows, from row m on) to distinct hit
 * words (columns not in lck).
 *
 * Each returned element is an Object[3]: [0] the summed length of the matched
 * words (Integer), [1] the summed distance (Integer), [2] the chosen columns
 * (List of Integer).
 *
 * @return the assignments, or null when m is past the last row or nothing matches
 */
private List<Object[]> calcCount(String blackWord,Integer[] kw,int[][][] arr,int m,int n,List<Integer> lck)
{
if(m == arr.length) // past the last key word
{
return null; // end of the recursion
}
StringBuilder sb = new StringBuilder();
List<Object[]> cache = getCacheCount(kw,m,n,arr,lck,sb);
if(cache != null)
{
this.shotCount++ ;
return cache;
}
cache = new ArrayList<Object[]>();
for(int i = m ;i<arr.length;i++)
{
boolean hasWordFit = false;
for(int j=0;j<n;j++)
{
if(lck.contains(j))
continue;
if(arr[i][j]!=null)
{
hasWordFit = true;
lck.add(j); // mark column j as used
List<Object[]> cache2 = calcCount(blackWord,kw,arr,i+1,n,lck);
if(cache2 == null)
{
// nothing further matches: this cell is an assignment on its own
Object[] nitem = new Object[3];
nitem[0] = arr[i][j][1]; //key word length
nitem[1] = arr[i][j][0]; //word dis count
nitem[2] = Arrays.asList(j);
cache.add(nitem);
}
else{
// prepend this cell to every assignment of the remaining rows
for(Object[] item:cache2)
{
Object[] nitem = new Object[item.length];
nitem[0] = (Integer)item[0] + arr[i][j][1]; //key word length
nitem[1] = (Integer)item[1] + arr[i][j][0]; //word dis count
List<Integer> path = new ArrayList<Integer>();
path.add(j);
path.addAll((List<Integer>) item[2]);
nitem[2] = path;
cache.add(nitem);
}
}
lck.remove(lck.size() - 1); // backtrack: release the column
}
}
if(!hasWordFit) // no hit word matches this key word, so no recursive call was made for it above
{
List<Object[]> cache2 = calcCount(blackWord,kw,arr,i+1,n,lck);
if(cache2 == null)
{
continue;
}
else
{
for(Object[] item:cache2)
{
Object[] nitem = new Object[item.length];
nitem[0] = item[0] ; //key word length
nitem[1] = item[1] ; //word dis count
List<Integer> path = new ArrayList<Integer>();
path.addAll((List<Integer>) item[2]); // the corresponding path
nitem[2] = path;
// NOTE(review): the original item is added here, not the copy built above.
cache.add(item);
}
}
}
}
if(cache.size() > 0)
{
this.cacheMap.put(sb.toString(), cache);
return cache;
}
return null;
}
/**
 * Non-memoised variant of the search that scores every complete assignment
 * directly. It is not called anywhere in this class; the only reference is in
 * commented-out code.
 */
private double calc(String blackWord,Integer[] kw,int[][][] arr,int m,int n,List<Integer> lck,int keyWordLength,int wordDis,List<Result> rs,int totalKeyWordLength)
{
if(m == arr.length) // past the last key word
{
Integer[] cols = lck.toArray(new Integer[0]);
Arrays.sort(cols);
int sum = 0; // word position gap
for(int i=cols.length - 1;i>1;i--)
sum = sum+cols[i]-cols[i-1];
int termDis = sum + (arr.length - cols.length);// term distance = position gap + number of unmatched key words
double sper = scalc(wordDis,termDis,keyWordLength,arr.length,totalKeyWordLength);
if(sper>=threshold) // report only scores that reach the threshold
{
// from the start of the first matched hit word to the end of the last one
long begPos = (int)items.get(
cols[0]).begPos;
long endPos = (int)items.get(
cols[cols.length - 1]).endPos;
Result rsitem = new Result();
rsitem.begpos = begPos;
rsitem.endpos = endPos;
rsitem.black = blackWord;
rsitem.shotdegree = sper;
rsitem.desstr = this.source.substring((int)begPos, (int)endPos);
rs.add(rsitem);
}
return sper;
}
double degree = -1;
for(int i = m ;i<arr.length;i++)
{
boolean hasWordFit = false;
for(int j=0;j<n;j++)
{
if(lck.contains(j))
continue;
if(arr[i][j]!=null)
{
hasWordFit = true;
lck.add(j); // mark column j as used
double ndegree = calc(blackWord,kw,arr,i+1,n,lck,keyWordLength+arr[i][j][1],wordDis+arr[i][j][0],rs,totalKeyWordLength);
lck.remove(lck.size() - 1); // backtrack: release the column
if(ndegree >= 99) // 99 is close enough to 100; no need to search further
return ndegree;
if(degree < ndegree)
degree = ndegree;
if(degree >= 99)
return degree;
}
}
if(!hasWordFit) // no hit word matches this key word, so no recursive call was made for it above
{
double ndegree = calc(blackWord,kw,arr,i+1,n,lck,keyWordLength,wordDis,rs,totalKeyWordLength);
if(ndegree >= 99) // 99 is close enough to 100; no need to search further
return ndegree;
if(degree < ndegree)
degree = ndegree;
}
}
return degree;
}
/**
 * Turns the figures of one assignment into a score.
 *
 * The score is the product of the share of characters that match, the share of
 * key words that are in place, and the share of the term's total key word length
 * that is covered, scaled by 100. Returns 0 when a distance exceeds its total.
 */
private double scalc(int wordDistance,int termDistance,int keyWordLength,int keyWordCount,int totalKeyWordLength)
{
// similarity formula; it may be tuned
if(termDistance > keyWordCount )
return 0;
if(wordDistance > keyWordLength)
return 0;
double percent = (1-(double)wordDistance/keyWordLength)*(1-(double)termDistance/keyWordCount)*100*((double)keyWordLength/totalKeyWordLength);
return percent;
}
}
package com.brilliance.word.blacklist.search;
/**
 * One reported match: which blacklist entry was hit, where, and how well.
 */
public class Result {
    /** The blacklist entry that was matched. */
    public String black;
    /** The piece of the scanned text that matched. */
    public String desstr;
    /** Start position of the match in the scanned text. */
    public long begpos;
    /** End position of the match in the scanned text. */
    public long endpos;
    /** Score of the match. */
    public double shotdegree;

    /** Formats the result as {@code score#entry#matched text}. */
    @Override
    public String toString() {
        return String.format("%f#%s#%s", shotdegree, black, desstr);
    }
}
package com.brilliance.word.blacklist.struct;
import com.brilliance.word.blacklist.interf.IMetricSpace;
import com.brilliance.word.blacklist.interf.IQuery;
import com.brilliance.word.blacklist.interf.IWordStore;
import com.brilliance.word.blacklist.search.HitSuggest;
/**
 * Tree-backed word store and query front end.
 * <p>
 * Words are inserted through {@link #put(String, String)} and looked up through
 * {@link #query(String)}; both delegate to the root {@link Node}, passing along
 * the metric and search radius held by this object.
 */
public class BKTree implements IQuery,IWordStore{
    /** Radius used when the single-argument constructor is chosen. */
    private final int DEFAULT_RADIUS = 3;
    /** Metric handed to every node operation. */
    private final IMetricSpace metricSpace;
    /** Root of the tree; {@code null} until the first word is stored. */
    private Node root;
    /** Maximum distance handed to node queries. */
    private double radius = DEFAULT_RADIUS;

    /**
     * Creates an empty tree that searches with the default radius.
     *
     * @param metricSpace metric used to compare words
     */
    public BKTree(IMetricSpace metricSpace)
    {
        this.metricSpace = metricSpace;
    }

    /**
     * Creates an empty tree that searches with the given radius.
     *
     * @param metricSpace metric used to compare words
     * @param radius      maximum distance passed to node queries
     */
    public BKTree(IMetricSpace metricSpace,double radius)
    {
        this.metricSpace = metricSpace;
        this.radius = radius;
    }

    /**
     * Looks up {@code word} in the tree.
     *
     * @param word word to search for
     * @return a fresh {@link HitSuggest}; left untouched when the tree is empty
     */
    public HitSuggest query(String word){
        final HitSuggest hits = new HitSuggest();
        if (this.root == null) {
            return hits;
        }
        this.root.query(metricSpace, word, radius, hits);
        return hits;
    }

    /**
     * Stores {@code word} and records that it belongs to {@code term}.
     *
     * @param word word to store
     * @param term term the word was taken from
     * @return always 0
     */
    public int put(String word,String term)
    {
        if (this.root != null) {
            this.root.add(metricSpace, word, term);
            return 0;
        }
        this.root = new Node(word, term);
        return 0;
    }
}
package com.brilliance.word.blacklist.struct;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import com.brilliance.word.blacklist.interf.IMetricSpace;
import com.brilliance.word.blacklist.search.HitSuggest;
/**
 * A tree node holding one word, the terms that word belongs to, and child
 * nodes keyed by their distance from this node's word.
 */
public class Node {
    /** Word stored at this node. */
    public final String value;
    /** Children keyed by distance from {@link #value}; {@code null} until the first child is added. */
    public Map<Double,Node> children;
    /** Terms this node's word has been registered under. */
    public Set<String> belong = new HashSet<String>();

    /**
     * Creates a leaf node.
     *
     * @param word word stored at the node
     * @param term first term the word belongs to
     */
    public Node(String word,String term)
    {
        this.value = word;
        belong.add(term);
    }

    /***
     * Inserts {@code word} below this node, or registers {@code term} on this
     * node when {@code word} is at distance 0 from {@link #value}.
     *
     * @param mts  metric used to calculate the distance
     * @param word the word to insert
     * @param term the term the word belongs to
     */
    public void add(IMetricSpace mts,String word,String term)
    {
        double dis = mts.distance(value, word);
        if(dis == 0) // distance 0 means the same word: only record the extra term
        {
            belong.add(term);
            return ;
        }
        if(this.children == null)
            this.children = new HashMap<Double,Node>();
        Node child = this.children.get(dis);
        if(child == null) // no child at this distance yet: the new word becomes that child
            children.put(dis,new Node(word,term));
        else
            child.add(mts, word,term); // slot taken: push the word down into that subtree
    }

    /***
     * Collects into {@code results} every word in this subtree that lies
     * within {@code radius} of {@code word}.
     * <p>
     * This node is reported only when its distance is within the radius and
     * also smaller than the length of its own word.
     *
     * @param mts     metric used to calculate the distance
     * @param word    word to match
     * @param radius  the maximum accepted distance
     * @param results accumulator receiving hits and their terms
     */
    public void query(IMetricSpace mts,String word,double radius,HitSuggest results){
        double dis = mts.distance(this.value, word);
        if(dis <= radius && dis < this.value.length())
        {
            results.addSuggest(this.value, dis,0);
            results.addBelong(this.belong);
        }
        if(children == null)
            return;
        // Triangle-inequality pruning: only children whose key d satisfies
        // dis - radius <= d <= dis + radius can contain a match.
        // The keys are filtered directly instead of probing dis-radius,
        // dis-radius+1, ... because with a fractional radius or metric those
        // probe values never equal a stored key and subtrees were skipped.
        final double lo = dis - radius;
        final double hi = dis + radius;
        // TreeMap keeps the ascending-distance visiting order of the old loop.
        final TreeMap<Double,Node> inRange = new TreeMap<Double,Node>();
        for(Map.Entry<Double,Node> entry : children.entrySet())
        {
            final Double key = entry.getKey();
            final Node child = entry.getValue();
            if(key == null || child == null)
                continue;
            if(key >= lo && key <= hi)
                inRange.put(key, child);
        }
        for(Node child : inRange.values())
        {
            child.query(mts, word, radius, results);
        }
    }
}
package com.brilliace.word.senstiveword;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
* Unit test for simple App.
*/
/**
 * Placeholder JUnit 3 test case.
 */
public class AppTest extends TestCase {

    /**
     * Creates the test case.
     *
     * @param testName name of the test case
     */
    public AppTest(String testName) {
        super(testName);
    }

    /**
     * @return a suite built from this class
     */
    public static Test suite() {
        final Test suite = new TestSuite(AppTest.class);
        return suite;
    }

    /**
     * Trivial check that always passes.
     */
    public void testApp() {
        assertTrue(true);
    }
}
package com.brilliace.word.senstiveword;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import com.brilliance.word.blacklist.BlackListShot0;
import com.brilliance.word.blacklist.search.Result;
/**
 * Manual smoke test: loads a word list from disk, runs one query against it
 * and prints timings, the returned score and up to 20 results.
 */
public class TestMain {
    /**
     * @param args unused
     * @throws FileNotFoundException if the word list file cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException
    {
        // Inline word list; only used by the commented-out StringReader variant below.
        String words = "fu kai";
        // Text to check against the loaded words.
        String term = "KIA ISa FU A good boy";
        File path = Paths.get("data/frequency_dictionary_en_82_765.txt").toFile();
        List<Result> rs = new ArrayList<Result>();
        BlackListShot0 bs = new BlackListShot0();
        long timebeg = System.currentTimeMillis();
        long timeend1;
        //bs.loadWords(new StringReader(words));
        FileReader reader = new FileReader(path);
        try {
            bs.loadWords(reader);
            timeend1 = System.currentTimeMillis();
        } finally {
            // Release the file handle even when loading fails.
            try {
                reader.close();
            } catch (java.io.IOException ignored) {
                // The reader was only read from; a failed close loses no data.
            }
        }
        System.out.println("load word:"+(timeend1-timebeg));
        double percent = bs.shot(term,40, rs);
        long timeend2 = System.currentTimeMillis();
        System.out.println("shot:"+(timeend2-timeend1));
        System.out.println(percent);
        // Print at most the first 20 results.
        if(rs.size() > 20)
            rs = rs.subList(0, 20);
        for(Result rsitem :rs)
        {
            System.out.println(rsitem);
        }
    }
}
package com.brilliace.word.senstiveword;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * Command-line helper that streams one XML feed file through
 * {@link XmlParseHandler} using a SAX parser.
 */
public class XMLAnalyzer2 {
    /**
     * Parses the hard-coded feed file.
     *
     * @param args unused
     * @throws SAXException                 if parsing fails
     * @throws IOException                  if the input or output file cannot be opened, read or closed
     * @throws ParserConfigurationException if no SAX parser can be created
     */
    public static void main(String[] args) throws SAXException, IOException, ParserConfigurationException {
        XmlParseHandler handler = new XmlParseHandler();
        try {
            FileInputStream fis = new FileInputStream("/Users/fukai/Downloads/Factiva_PFA_Feed_XML/PFA2_201708312200_F.xml");
            try {
                // 1. Obtain the SAX parser factory.
                // NOTE(review): the factory is used with default settings, so DTDs and
                // external entities are processed; harden it if the feed is not trusted.
                SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
                // 2. Let the factory create a SAX parser.
                SAXParser newSAXParser = saxParserFactory.newSAXParser();
                // 3. Parse the input stream with the handler.
                newSAXParser.parse(fis, handler);
            } finally {
                // Close the input even when parsing throws.
                fis.close();
            }
        } finally {
            // endDocument() is not reached when parsing aborts, so make sure the
            // handler's writer is flushed and closed; closing it twice is harmless.
            if (handler.bw != null) {
                handler.bw.close();
            }
        }
    }
}
/**
 * SAX handler that writes the {@code name} attribute of every element found
 * inside a {@code Records} element to a text file, one value per line
 * (terminated by {@code \r\n}).
 */
class XmlParseHandler extends DefaultHandler
{
    /** Destination of the extracted names; closed in {@link #endDocument()}. */
    BufferedWriter bw = null;
    /** True while the parser is between the start and end of a {@code Records} element. */
    boolean begin = false;

    /**
     * Creates a handler writing to the default output file.
     *
     * @throws IOException if the output file cannot be opened
     */
    public XmlParseHandler() throws IOException{
        this(new BufferedWriter(new FileWriter("/Volumes/FUKAI/1234.txt")));
    }

    /**
     * Creates a handler writing to the given writer, which the handler closes
     * at end of document.
     *
     * @param out destination for the extracted names
     */
    XmlParseHandler(BufferedWriter out) {
        this.bw = out;
    }

    /**
     * Flushes and closes the output.
     *
     * @throws SAXException wrapping the {@link IOException} if closing fails
     */
    @Override
    public void endDocument() throws SAXException {
        try {
            bw.close();
        } catch (IOException e) {
            // A failed close may mean buffered names were lost; report it.
            throw new SAXException("failed to close name output", e);
        }
    }

    /** Stops collecting names when the {@code Records} element ends. */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if("Records".equals(qName))
        {
            begin = false;
        }
    }

    @Override
    public void startDocument() throws SAXException {
        super.startDocument();
    }

    /**
     * Starts collecting at a {@code Records} element; for any other element
     * seen while collecting, writes its {@code name} attribute when present.
     *
     * @throws SAXException wrapping the {@link IOException} if writing fails
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        if("Records".equals(qName))
        {
            begin = true;
            System.out.println("SanctionsReferencesList begin");
            return ;
        }
        if(!begin)
            return;
        String name = attributes.getValue("name");
        // Elements without a name attribute carry nothing to record; writing
        // them would put the literal text "null" into the output.
        if(name == null)
            return;
        try {
            bw.write(name+"\r\n");
        } catch (IOException e) {
            // Abort the parse instead of silently producing a truncated file.
            throw new SAXException("failed to write name '" + name + "'", e);
        }
    }
}
\ No newline at end of file
Manifest-Version: 1.0
Built-By: fukai
Build-Jdk: 1.8.0_121
Created-By: Maven Integration for Eclipse
#Generated by Maven Integration for Eclipse
#Thu Nov 01 15:38:01 CST 2018
version=0.0.1-SNAPSHOT
groupId=com.brilliace.word
m2e.projectName=senstiveword
m2e.projectLocation=/Users/fukai/Documents/workspace/senstiveword
artifactId=senstiveword
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.brilliace.word</groupId>
<artifactId>senstiveword</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>senstiveword</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>sax</groupId>
<artifactId>sax</artifactId>
<version>2.0.1</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<finalName>senstiveword</finalName>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<source>1.5</source>
<target>1.5</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
#Generated by Maven
#Mon Sep 17 16:17:22 CST 2018
version=0.0.1-SNAPSHOT
groupId=com.brilliace.word
artifactId=senstiveword
com/brilliance/word/blacklist/BlackListDic.class
com/brilliance/word/blacklist/search/BlackSearh$1.class
com/brilliance/word/blacklist/search/BlackSearh$1$1.class
com/brilliance/word/blacklist/extend/simple/SimpleWordWalker.class
com/brilliance/word/blacklist/BlackListShot0.class
com/brilliance/word/blacklist/struct/BKTree.class
com/brilliance/word/blacklist/TestMain.class
SymSpell/EditDistance$DistanceAlgorithm.class
com/brilliance/word/blacklist/search/Result.class
com/brilliance/word/blacklist/algorithm/Distance2.class
com/brilliance/word/blacklist/Utils.class
SymSpell/SuggestionStage$Node.class
com/brilliance/word/blacklist/search/HitSuggestItem.class
SymSpell/ChunkArray.class
SymSpell/SuggestionStage$Entry.class
SymSpell/SymSpellDemo.class
com/brilliance/word/blacklist/TestMain2.class
com/brilliance/word/blacklist/algorithm/EditDistance$DistanceAlgorithm.class
com/brilliance/word/blacklist/struct/Node.class
com/brilliance/word/blacklist/interf/IWordListener.class
com/brilliance/word/blacklist/interf/IWordStore.class
com/brilliance/word/blacklist/search/HitSuggest.class
SymSpell/SuggestItem.class
com/brilliance/word/blacklist/algorithm/DamerauEditDistance.class
com/brilliance/word/blacklist/BlackListShot0$1.class
SymSpell/EditDistance$1.class
com/brilliance/word/blacklist/search/HitWord.class
com/brilliance/word/blacklist/BlackListShot$1.class
com/brilliance/word/blacklist/interf/IWordBKTreeMaker.class
com/brilliance/word/blacklist/interf/IMetricSpace.class
com/brilliance/word/blacklist/search/BlackSearh.class
com/brilliance/word/blacklist/search/HitWordGroup.class
com/brilliance/word/blacklist/extend/simple/SimpleBKTreeMaker$1.class
com/brilliance/word/blacklist/extend/simple/SimpleBKTreeMaker.class
SymSpell/SymSpellQuery.class
SymSpell/SymSpell.class
SymSpell/SymSpell$Verbosity.class
com/brilliance/word/blacklist/BlackListShot$2.class
SymSpell/SuggestionStage.class
com/brilliance/word/blacklist/interf/IWordWalker.class
com/brilliance/word/blacklist/BlackListShot.class
com/brilliance/word/blacklist/algorithm/EditDistance$1.class
com/brilliance/word/blacklist/algorithm/EditDistance.class
com/brilliance/word/blacklist/interf/IQuery.class
SymSpell/EditDistance.class
SymSpell/SymSpell$1.class
com/brilliace/word/senstiveword/XmlParseHandler.class
com/brilliace/word/senstiveword/TestMain.class
com/brilliace/word/senstiveword/AppTest.class
com/brilliace/word/senstiveword/XMLAnalyzer2.class
<?xml version="1.0" encoding="UTF-8" ?>
<testsuite tests="1" failures="0" name="com.brilliace.word.senstiveword.AppTest" time="0.004" errors="0" skipped="0">
<properties>
<property name="java.runtime.name" value="Java(TM) SE Runtime Environment"/>
<property name="sun.boot.library.path" value="/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib"/>
<property name="java.vm.version" value="25.121-b13"/>
<property name="gopherProxySet" value="false"/>
<property name="java.vm.vendor" value="Oracle Corporation"/>
<property name="maven.multiModuleProjectDirectory" value="/Users/fukai/Documents/workspace/senstiveword"/>
<property name="java.vendor.url" value="http://java.oracle.com/"/>
<property name="path.separator" value=":"/>
<property name="guice.disable.misplaced.annotation.check" value="true"/>
<property name="java.vm.name" value="Java HotSpot(TM) 64-Bit Server VM"/>
<property name="file.encoding.pkg" value="sun.io"/>
<property name="user.country" value="CN"/>
<property name="sun.java.launcher" value="SUN_STANDARD"/>
<property name="sun.os.patch.level" value="unknown"/>
<property name="java.vm.specification.name" value="Java Virtual Machine Specification"/>
<property name="user.dir" value="/Users/fukai/Documents/workspace/senstiveword"/>
<property name="java.runtime.version" value="1.8.0_121-b13"/>
<property name="java.awt.graphicsenv" value="sun.awt.CGraphicsEnvironment"/>
<property name="java.endorsed.dirs" value="/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib/endorsed"/>
<property name="os.arch" value="x86_64"/>
<property name="java.io.tmpdir" value="/var/folders/pd/hbf3q9xx6fq6p9j1nn7kx0xw0000gn/T/"/>
<property name="line.separator" value="
"/>
<property name="java.vm.specification.vendor" value="Oracle Corporation"/>
<property name="os.name" value="Mac OS X"/>
<property name="classworlds.conf" value="/Users/fukai/Documents/workspace/.metadata/.plugins/org.eclipse.m2e.launching/launches/m2conf8504987481171531513.tmp"/>
<property name="sun.jnu.encoding" value="UTF-8"/>
<property name="java.library.path" value="/Users/fukai/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:."/>
<property name="java.specification.name" value="Java Platform API Specification"/>
<property name="java.class.version" value="52.0"/>
<property name="sun.management.compiler" value="HotSpot 64-Bit Tiered Compilers"/>
<property name="os.version" value="10.13.6"/>
<property name="user.home" value="/Users/fukai"/>
<property name="user.timezone" value="Asia/Shanghai"/>
<property name="java.awt.printerjob" value="sun.lwawt.macosx.CPrinterJob"/>
<property name="java.specification.version" value="1.8"/>
<property name="file.encoding" value="UTF-8"/>
<property name="user.name" value="fukai"/>
<property name="java.class.path" value="/Users/fukai/eclipse/jee-neon/Eclipse.app/Contents/Eclipse/../../../../../.p2/pool/plugins/org.eclipse.m2e.maven.runtime_1.7.0.20160603-1931/jars/plexus-classworlds-2.5.2.jar"/>
<property name="java.vm.specification.version" value="1.8"/>
<property name="sun.arch.data.model" value="64"/>
<property name="java.home" value="/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre"/>
<property name="sun.java.command" value="org.codehaus.plexus.classworlds.launcher.Launcher -B -gs /Users/fukai/apache-maven-3.3.9/conf/settings.xml -s /Users/fukai/apache-maven-3.3.9/conf/settings.xml install"/>
<property name="java.specification.vendor" value="Oracle Corporation"/>
<property name="user.language" value="zh"/>
<property name="awt.toolkit" value="sun.lwawt.macosx.LWCToolkit"/>
<property name="java.vm.info" value="mixed mode"/>
<property name="java.version" value="1.8.0_121"/>
<property name="java.ext.dirs" value="/Users/fukai/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java"/>
<property name="sun.boot.class.path" value="/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_121.jdk/Contents/Home/jre/classes"/>
<property name="java.vendor" value="Oracle Corporation"/>
<property name="maven.home" value="/Users/fukai/Documents/workspace/senstiveword/EMBEDDED"/>
<property name="file.separator" value="/"/>
<property name="java.vendor.url.bug" value="http://bugreport.sun.com/bugreport/"/>
<property name="sun.cpu.endian" value="little"/>
<property name="sun.io.unicode.encoding" value="UnicodeBig"/>
<property name="sun.cpu.isalist" value=""/>
</properties>
<testcase classname="com.brilliace.word.senstiveword.AppTest" name="testApp" time="0.004"/>
</testsuite>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment