Commit f99b6693 by fukai

提交黑名单修改

parent 9d66d59c
...@@ -14,7 +14,7 @@ import com.brilliance.word.blacklist.search.HitSuggest; ...@@ -14,7 +14,7 @@ import com.brilliance.word.blacklist.search.HitSuggest;
public class SymSpellQuery implements IQuery,IWordStore{ public class SymSpellQuery implements IQuery,IWordStore{
SymSpell symSpell ; SymSpell symSpell ;
Map<String,Object[]> wordsMap = new HashMap<String,Object[]>(2000); Map<String,Object[]> wordsMap = new HashMap<String,Object[]>(2048);
SuggestionStage suggstage = new SuggestionStage(16384); SuggestionStage suggstage = new SuggestionStage(16384);
int maxEditDistance = 3; int maxEditDistance = 3;
public SymSpellQuery(int maxEditDistance) public SymSpellQuery(int maxEditDistance)
...@@ -25,20 +25,20 @@ public class SymSpellQuery implements IQuery,IWordStore{ ...@@ -25,20 +25,20 @@ public class SymSpellQuery implements IQuery,IWordStore{
@Override @Override
public int put(String word, String term) { public int put(String word, String termID) {
// TODO Auto-generated method stub // TODO Auto-generated method stub
Object[] arr = wordsMap.get(word); Object[] arr = wordsMap.get(word);
if(arr != null) if(arr != null)
{ {
((Set<String>) arr[0]).add(term); ((Set<String>) arr[0]).add(termID);
return (Integer) arr[1]; return (Integer) arr[1];
} }
Set<String> words = new HashSet<String>(); Set<String> words = new HashSet<String>();
words.add(term); words.add(termID);
int cnt = Utils.WORD_COUNTER ++ ; int wordID = Utils.WORD_COUNTER ++ ;
wordsMap.put(word,new Object[]{ words,cnt}); wordsMap.put(word,new Object[]{ words,wordID});
symSpell.createDictionaryEntry(word, cnt, suggstage); symSpell.createDictionaryEntry(word, wordID, suggstage);
return cnt; return wordID;
} }
public void commitStaged() public void commitStaged()
......
...@@ -6,13 +6,19 @@ import java.util.Map; ...@@ -6,13 +6,19 @@ import java.util.Map;
public class BlackListDic { public class BlackListDic {
private Map<String,Integer[]> blackWords = new HashMap<String,Integer[]>(); private Map<String,Integer[]> blackWords = new HashMap<String,Integer[]>();
private Map<String,String> words = new HashMap<String,String>();
/** /**
* *
* @param term 新增黑名单词条 * @param id 新增黑名单词条
* @param integers * @param integers
*/ */
public void addBlackTerm(String term,Integer[] integers){ public void addBlackTerm(String id,Integer[] integers){
blackWords.put(term, integers); blackWords.put(id, integers);
}
public void addBlackWord(String id,String term)
{
words.put(id, term);
} }
/** /**
* 删除黑名单词条 * 删除黑名单词条
...@@ -34,4 +40,9 @@ public class BlackListDic { ...@@ -34,4 +40,9 @@ public class BlackListDic {
{ {
return blackWords.get(term); return blackWords.get(term);
} }
/**
 * Looks up the string stored in the {@code words} map under the given key.
 * Entries in that map are written by {@code addBlackWord(id, term)}, which
 * stores {@code term} keyed by {@code id}.
 *
 * @param blackWord key to look up in {@code words}
 * @return the string mapped to the key, or {@code null} when the map has no entry for it
 */
public String getTerm(String blackWord) {
// Plain map lookup; the key is passed through without any validation.
return words.get(blackWord);
}
} }
...@@ -33,16 +33,20 @@ public class BlackListShot { ...@@ -33,16 +33,20 @@ public class BlackListShot {
{ {
final List<Integer> words = new ArrayList<Integer>(); final List<Integer> words = new ArrayList<Integer>();
SimpleWordWalker sww = new SimpleWordWalker(); SimpleWordWalker sww = new SimpleWordWalker();
int idIndex = line.indexOf('|'); int idIndex = line.lastIndexOf('|');
String copy = line; String copy = line;
String id = null; String id = null;
if(idIndex > 0) if(idIndex > 0)
{ {
id = line.substring(0, idIndex); copy = line.substring(0, idIndex).trim();
copy = line.substring(idIndex+1); // 从第一个分隔符开始获取 //标识符
id = line.substring(idIndex+1).trim(); // 从第一个分隔符开始获取
} }
else
continue;
final String term = line; final String term = line;
final String ID = id; final String ID = id+"@"+String.valueOf(Utils.TERM_COUNTER++); //考虑到词条主键不唯一,按照行号辅助做ID
sww.walk(new StringReader(copy),new IWordListener(){ sww.walk(new StringReader(copy),new IWordListener(){
@Override @Override
...@@ -57,6 +61,7 @@ public class BlackListShot { ...@@ -57,6 +61,7 @@ public class BlackListShot {
}); });
blklist.addBlackTerm(ID, words.toArray(new Integer[0])); blklist.addBlackTerm(ID, words.toArray(new Integer[0]));
blklist.addBlackWord(ID, copy);
count++; count++;
if(count > 500000) if(count > 500000)
break; break;
......
...@@ -13,9 +13,9 @@ public class TestMain2 { ...@@ -13,9 +13,9 @@ public class TestMain2 {
public static void main(String[] args) throws FileNotFoundException { public static void main(String[] args) throws FileNotFoundException {
// String term = "KIA ISa FU A good boy goosed 13178KIA ISa FU XXXXXXX YYYYYYY ZZZZZZZ A good boy goosed 13178 KIA ISa FU A good boy goosed 13178 Mr. Johnson had never been up in an aerophane before and he had read a lot about air accidents, so one day when a friend offered to take him for a ride in his own small phane, Mr. Johnson was very worried about accepting. Finally, however, his friend persuaded him that it was very safe, and Mr. Johnson boarded the plane.His friend started the engine and began to taxi onto the runway of the airport. Mr. Johnson had heard that the most dangerous part of a flight were the take-off and the landing, so he was extremely frightened and closed his eyes.  I still think of how foolish I must have looked, as I gazed at you, that first time. I remember watching you intently, as you took off your hat and loosely shook your short dark hair with your fingers. I felt myself becoming immersed in your every detail, as you placed your hat on the table and cupped your hands around the hot cup of tea, gently blowing the steam away with your pouted lips."; // String term = "KIA ISa FU A good boy goosed 13178KIA ISa FU XXXXXXX YYYYYYY ZZZZZZZ A good boy goosed 13178 KIA ISa FU A good boy goosed 13178 Mr. Johnson had never been up in an aerophane before and he had read a lot about air accidents, so one day when a friend offered to take him for a ride in his own small phane, Mr. Johnson was very worried about accepting. Finally, however, his friend persuaded him that it was very safe, and Mr. Johnson boarded the plane.His friend started the engine and began to taxi onto the runway of the airport. Mr. Johnson had heard that the most dangerous part of a flight were the take-off and the landing, so he was extremely frightened and closed his eyes.  I still think of how foolish I must have looked, as I gazed at you, that first time. I remember watching you intently, as you took off your hat and loosely shook your short dark hair with your fingers. 
I felt myself becoming immersed in your every detail, as you placed your hat on the table and cupped your hands around the hot cup of tea, gently blowing the steam away with your pouted lips.";
String term = "Jean-Loris Bokassa is a good boy Jean-Loris Bokassa hahahhahahaha adaad adadad ada dada Jean-Loris Bokassa"; String term = "hcdhau Ogre de Berengo but" ;
System.out.println(term.length()); System.out.println(term.length());
File path = Paths.get("data/id-name.txt").toFile(); File path = Paths.get("/Volumes/FUKAI/words/text/name").toFile();
List<Result> rs = new ArrayList<Result>(); List<Result> rs = new ArrayList<Result>();
BlackListShot bs = new BlackListShot(); BlackListShot bs = new BlackListShot();
......
...@@ -31,5 +31,6 @@ public class Utils { ...@@ -31,5 +31,6 @@ public class Utils {
} }
public static ExecutorService executor = Executors.newCachedThreadPool(); public static ExecutorService executor = Executors.newCachedThreadPool();
public static int WORD_COUNTER= 11; public static int WORD_COUNTER= 11;
public static int TERM_COUNTER = 11;
public static final boolean WHOLD_WORD = true; public static final boolean WHOLD_WORD = true;
} }
...@@ -37,7 +37,7 @@ public class HitWordGroup { ...@@ -37,7 +37,7 @@ public class HitWordGroup {
{ {
double degree = -1; double degree = -1;
//与现有词组计算匹配度 //与现有词组计算匹配度
Map<String,Integer> termCount = new HashMap<String,Integer>(30000); //计算语句Count Map<String,Integer> termCount = new HashMap<String,Integer>(16384); //计算语句Count
for(HitWord hw : items) for(HitWord hw : items)
{ {
for(String bitem:hw.item.belong) for(String bitem:hw.item.belong)
...@@ -58,7 +58,7 @@ public class HitWordGroup { ...@@ -58,7 +58,7 @@ public class HitWordGroup {
final Integer[] keyWords = blklist.getKeyWords(term); final Integer[] keyWords = blklist.getKeyWords(term);
if(count < (keyWords.length -1 )*(threshold /100.0)) if(count < (keyWords.length -1 )*(threshold /100.0))
continue; continue;
double ditem = fuzzyDegree(term,keyWords,rs);//单个匹配度 double ditem = fuzzyDegree(blklist,term,keyWords,rs);//单个匹配度
cnt++; cnt++;
if(ditem > degree) if(ditem > degree)
degree = ditem; degree = ditem;
...@@ -69,7 +69,7 @@ public class HitWordGroup { ...@@ -69,7 +69,7 @@ public class HitWordGroup {
System.out.println("----shot count-----:"+this.shotCount); System.out.println("----shot count-----:"+this.shotCount);
return degree; return degree;
} }
public double fuzzyDegree(String blackWord,Integer[] keyWords,List<Result> rs) public double fuzzyDegree(BlackListDic blklist,String blackWord,Integer[] keyWords,List<Result> rs)
{ {
//计算矩阵 //计算矩阵
int m = keyWords.length; int m = keyWords.length;
...@@ -136,9 +136,10 @@ public class HitWordGroup { ...@@ -136,9 +136,10 @@ public class HitWordGroup {
Result rsitem = new Result(); Result rsitem = new Result();
rsitem.begpos = begPos; rsitem.begpos = begPos;
rsitem.endpos = endPos; rsitem.endpos = endPos;
rsitem.black = blackWord; rsitem.black = blackWord.split("@")[0];
rsitem.shotdegree = sper; rsitem.shotdegree = sper;
rsitem.desstr = this.source.substring((int)begPos, (int)endPos); rsitem.desstr = this.source.substring((int)begPos, (int)endPos);
rsitem.shotWord = blklist.getTerm(blackWord);
rs.add(rsitem); rs.add(rsitem);
} }
if(sper > percent) if(sper > percent)
...@@ -273,7 +274,7 @@ public class HitWordGroup { ...@@ -273,7 +274,7 @@ public class HitWordGroup {
Result rsitem = new Result(); Result rsitem = new Result();
rsitem.begpos = begPos; rsitem.begpos = begPos;
rsitem.endpos = endPos; rsitem.endpos = endPos;
rsitem.black = blackWord; rsitem.black = blackWord.split("@")[0];
rsitem.shotdegree = sper; rsitem.shotdegree = sper;
rsitem.desstr = this.source.substring((int)begPos, (int)endPos); rsitem.desstr = this.source.substring((int)begPos, (int)endPos);
rs.add(rsitem); rs.add(rsitem);
......
...@@ -6,7 +6,8 @@ public class Result { ...@@ -6,7 +6,8 @@ public class Result {
public long endpos; public long endpos;
public String black; public String black;
public String desstr; public String desstr;
public String shotWord;
public String toString(){ public String toString(){
return String.format("%f#%s#%s", shotdegree,black,desstr); return String.format("%f#%s#%s#%s", shotdegree,black,desstr,shotWord);
} }
} }
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment