Commit f99b6693 by fukai

提交黑名单修改

parent 9d66d59c
......@@ -14,7 +14,7 @@ import com.brilliance.word.blacklist.search.HitSuggest;
public class SymSpellQuery implements IQuery,IWordStore{
SymSpell symSpell ;
Map<String,Object[]> wordsMap = new HashMap<String,Object[]>(2000);
Map<String,Object[]> wordsMap = new HashMap<String,Object[]>(2048);
SuggestionStage suggstage = new SuggestionStage(16384);
int maxEditDistance = 3;
public SymSpellQuery(int maxEditDistance)
......@@ -25,20 +25,20 @@ public class SymSpellQuery implements IQuery,IWordStore{
@Override
public int put(String word, String term) {
public int put(String word, String termID) {
// TODO Auto-generated method stub
Object[] arr = wordsMap.get(word);
if(arr != null)
{
((Set<String>) arr[0]).add(term);
((Set<String>) arr[0]).add(termID);
return (Integer) arr[1];
}
Set<String> words = new HashSet<String>();
words.add(term);
int cnt = Utils.WORD_COUNTER ++ ;
wordsMap.put(word,new Object[]{ words,cnt});
symSpell.createDictionaryEntry(word, cnt, suggstage);
return cnt;
words.add(termID);
int wordID = Utils.WORD_COUNTER ++ ;
wordsMap.put(word,new Object[]{ words,wordID});
symSpell.createDictionaryEntry(word, wordID, suggstage);
return wordID;
}
public void commitStaged()
......
......@@ -6,13 +6,19 @@ import java.util.Map;
public class BlackListDic {
private Map<String,Integer[]> blackWords = new HashMap<String,Integer[]>();
private Map<String,String> words = new HashMap<String,String>();
/**
*
* @param term 新增黑名单词条
* @param id 新增黑名单词条
* @param integers
*/
public void addBlackTerm(String term,Integer[] integers){
blackWords.put(term, integers);
public void addBlackTerm(String id,Integer[] integers){
blackWords.put(id, integers);
}
/**
 * Records the text associated with an identifier in the {@code words} map.
 * An entry already stored under the same key is replaced.
 *
 * @param id   key under which the text is stored
 * @param term text to store for {@code id}
 */
public void addBlackWord(String id,String term)
{
words.put(id, term);
}
/**
* 删除黑名单词条
......@@ -34,4 +40,9 @@ public class BlackListDic {
{
return blackWords.get(term);
}
/**
 * Looks up the text stored in the {@code words} map for the given key.
 *
 * @param blackWord key to look up
 * @return the text stored for {@code blackWord}, or {@code null} when nothing is stored for it
 */
public String getTerm(String blackWord) {
return words.get(blackWord);
}
}
......@@ -33,16 +33,20 @@ public class BlackListShot {
{
final List<Integer> words = new ArrayList<Integer>();
SimpleWordWalker sww = new SimpleWordWalker();
int idIndex = line.indexOf('|');
int idIndex = line.lastIndexOf('|');
String copy = line;
String id = null;
if(idIndex > 0)
{
id = line.substring(0, idIndex);
copy = line.substring(idIndex+1); // 从第一个分隔符开始获取
copy = line.substring(0, idIndex).trim();
//标识符
id = line.substring(idIndex+1).trim(); // 从第一个分隔符开始获取
}
else
continue;
final String term = line;
final String ID = id;
final String ID = id+"@"+String.valueOf(Utils.TERM_COUNTER++); //考虑到词条主键不唯一,按照行号辅助做ID
sww.walk(new StringReader(copy),new IWordListener(){
@Override
......@@ -57,6 +61,7 @@ public class BlackListShot {
});
blklist.addBlackTerm(ID, words.toArray(new Integer[0]));
blklist.addBlackWord(ID, copy);
count++;
if(count > 500000)
break;
......
......@@ -13,9 +13,9 @@ public class TestMain2 {
public static void main(String[] args) throws FileNotFoundException {
// String term = "KIA ISa FU A good boy goosed 13178KIA ISa FU XXXXXXX YYYYYYY ZZZZZZZ A good boy goosed 13178 KIA ISa FU A good boy goosed 13178 Mr. Johnson had never been up in an aerophane before and he had read a lot about air accidents, so one day when a friend offered to take him for a ride in his own small phane, Mr. Johnson was very worried about accepting. Finally, however, his friend persuaded him that it was very safe, and Mr. Johnson boarded the plane.His friend started the engine and began to taxi onto the runway of the airport. Mr. Johnson had heard that the most dangerous part of a flight were the take-off and the landing, so he was extremely frightened and closed his eyes.  I still think of how foolish I must have looked, as I gazed at you, that first time. I remember watching you intently, as you took off your hat and loosely shook your short dark hair with your fingers. I felt myself becoming immersed in your every detail, as you placed your hat on the table and cupped your hands around the hot cup of tea, gently blowing the steam away with your pouted lips.";
String term = "Jean-Loris Bokassa is a good boy Jean-Loris Bokassa hahahhahahaha adaad adadad ada dada Jean-Loris Bokassa";
String term = "hcdhau Ogre de Berengo but" ;
System.out.println(term.length());
File path = Paths.get("data/id-name.txt").toFile();
File path = Paths.get("/Volumes/FUKAI/words/text/name").toFile();
List<Result> rs = new ArrayList<Result>();
BlackListShot bs = new BlackListShot();
......
......@@ -31,5 +31,6 @@ public class Utils {
}
public static ExecutorService executor = Executors.newCachedThreadPool();
public static int WORD_COUNTER= 11;
public static int TERM_COUNTER = 11;
public static final boolean WHOLD_WORD = true;
}
......@@ -37,7 +37,7 @@ public class HitWordGroup {
{
double degree = -1;
//与现有词组计算匹配度
Map<String,Integer> termCount = new HashMap<String,Integer>(30000); //计算语句Count
Map<String,Integer> termCount = new HashMap<String,Integer>(16384); //计算语句Count
for(HitWord hw : items)
{
for(String bitem:hw.item.belong)
......@@ -58,7 +58,7 @@ public class HitWordGroup {
final Integer[] keyWords = blklist.getKeyWords(term);
if(count < (keyWords.length -1 )*(threshold /100.0))
continue;
double ditem = fuzzyDegree(term,keyWords,rs);//单个匹配度
double ditem = fuzzyDegree(blklist,term,keyWords,rs);//单个匹配度
cnt++;
if(ditem > degree)
degree = ditem;
......@@ -69,7 +69,7 @@ public class HitWordGroup {
System.out.println("----shot count-----:"+this.shotCount);
return degree;
}
public double fuzzyDegree(String blackWord,Integer[] keyWords,List<Result> rs)
public double fuzzyDegree(BlackListDic blklist,String blackWord,Integer[] keyWords,List<Result> rs)
{
//计算矩阵
int m = keyWords.length;
......@@ -136,9 +136,10 @@ public class HitWordGroup {
Result rsitem = new Result();
rsitem.begpos = begPos;
rsitem.endpos = endPos;
rsitem.black = blackWord;
rsitem.black = blackWord.split("@")[0];
rsitem.shotdegree = sper;
rsitem.desstr = this.source.substring((int)begPos, (int)endPos);
rsitem.shotWord = blklist.getTerm(blackWord);
rs.add(rsitem);
}
if(sper > percent)
......@@ -273,7 +274,7 @@ public class HitWordGroup {
Result rsitem = new Result();
rsitem.begpos = begPos;
rsitem.endpos = endPos;
rsitem.black = blackWord;
rsitem.black = blackWord.split("@")[0];
rsitem.shotdegree = sper;
rsitem.desstr = this.source.substring((int)begPos, (int)endPos);
rs.add(rsitem);
......
......@@ -6,7 +6,8 @@ public class Result {
public long endpos;
public String black;
public String desstr;
public String shotWord;
public String toString(){
return String.format("%f#%s#%s", shotdegree,black,desstr);
return String.format("%f#%s#%s#%s", shotdegree,black,desstr,shotWord);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment