/**
 * Adds files to a Lucene index located in a file-system directory and can
 * print the terms an input text is split into.
 *
 * <p>Text is analyzed with {@code SmartChineseAnalyzer} (Chinese word
 * segmentation). Call {@link #close()} once indexing is finished so the
 * underlying {@code IndexWriter} is closed.
 */
public class Indexer {
    private final IndexWriter writer;
    private final Analyzer analyzer;

    /**
     * Opens an index writer on the given directory.
     *
     * @param indexDir path of the directory that holds the index
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));
        analyzer = new SmartChineseAnalyzer(Version.LUCENE_35, true); // Chinese word segmentation
        writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
    }

    /**
     * Builds a document for the file and adds it to the index.
     *
     * @param f file to index
     * @throws Exception if the file cannot be read or the document cannot be added
     */
    public void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);
    }

    /**
     * Prints every term the analyzer produces for {@code text} to standard
     * output, each followed by a single space.
     *
     * @param fieldName field name handed to the analyzer
     * @param text      text to tokenize
     * @throws IllegalStateException if the token stream reports an I/O error
     */
    public void getTermText(String fieldName, String text) {
        try {
            // Reuse the analyzer's token stream to speed things up.
            TokenStream stream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
            try {
                // Attribute through which the current term's text is read.
                CharTermAttribute charTerm = stream.addAttribute(CharTermAttribute.class);
                // The consumer must reset the stream before the first incrementToken().
                stream.reset();
                while (stream.incrementToken()) {
                    System.out.print(charTerm.toString() + " ");
                }
                stream.end();
            } finally {
                stream.close();
            }
        } catch (IOException e) {
            // Wrapped so the method keeps its original signature (no checked exception).
            throw new IllegalStateException(
                    "Failed to tokenize text for field '" + fieldName + "'", e);
        }
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws IOException if closing the writer fails
     */
    public void close() throws IOException {
        writer.close();
    }

    /**
     * Creates the document for a file: a {@code contents} field fed from a
     * reader over the file, plus stored, non-analyzed {@code filename} and
     * {@code fullpath} fields.
     *
     * @param f file to convert
     * @return the document describing {@code f}
     * @throws Exception if the file cannot be opened or its path cannot be resolved
     */
    protected Document getDocument(File f) throws Exception {
        Document doc = new Document();
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that matches the encoding of the files being indexed.
        doc.add(new Field("contents", new FileReader(f)));
        doc.add(new Field("filename", f.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("fullpath", f.getCanonicalPath(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }
}
// imdict-chinese-analyzer - imdict智能词典所采用的智能中文分词程序
// http://code.google.com/p/imdict-chinese-analyzer/
// 这些应该对你有用。