I'm looking for a way to do query auto-completion/suggestions in Lucene. I've Googled around a bit and played around with it, but all of the examples I've seen seem to set up filters in Solr. We don't use Solr and aren't planning to move to it in the near future, and since Solr obviously just wraps Lucene anyway, there must be a way to do it!
I've looked into using an EdgeNGramFilter, and I realise that I'd have to run the filter on the indexed fields, get the tokens out, and then compare them against the entered query... I'm just struggling to connect the two into a bit of code, so help is much appreciated!
To be clear on what I'm looking for (I realised I wasn't being overly clear, sorry): I'm looking for a solution that, when searching for a term, returns a list of suggested queries. Typing "inter" into the search field would come back with a list of suggested queries, such as "internet", "international", etc.
Based on @Alexandre Victoor's answer, I wrote a little class based on the Lucene Spellchecker in the contrib package (using the LuceneDictionary included in it) that does exactly what I need.
It allows re-indexing from a single source index with a single field, and provides suggestions for terms. Results are sorted by the number of matching documents with that term in the original index, so more popular terms appear first. Seems to work pretty well :)
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
* Search term auto-completer, works for single terms (so use on the last term
* of the query).
* <p>
* Returns more popular terms first.
*
* @author Mat Mannion, [email protected]
*/
public final class Autocompleter {
private static final String GRAMMED_WORDS_FIELD = "words";
private static final String SOURCE_WORD_FIELD = "sourceWord";
private static final String COUNT_FIELD = "count";
private static final String[] ENGLISH_STOP_WORDS = {
"a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "i", "if", "in", "into", "is",
"no", "not", "of", "on", "or", "s", "such",
"t", "that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
private final Directory autoCompleteDirectory;
private IndexReader autoCompleteReader;
private IndexSearcher autoCompleteSearcher;
public Autocompleter(String autoCompleteDir) throws IOException {
this.autoCompleteDirectory = FSDirectory.getDirectory(autoCompleteDir,
null);
reOpenReader();
}
public List<String> suggestTermsFor(String term) throws IOException {
// get the top 5 terms for query
Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, term));
Sort sort = new Sort(COUNT_FIELD, true);
TopDocs docs = autoCompleteSearcher.search(query, null, 5, sort);
List<String> suggestions = new ArrayList<String>();
for (ScoreDoc doc : docs.scoreDocs) {
suggestions.add(autoCompleteReader.document(doc.doc).get(
SOURCE_WORD_FIELD));
}
return suggestions;
}
@SuppressWarnings("unchecked")
public void reIndex(Directory sourceDirectory, String fieldToAutocomplete)
throws CorruptIndexException, IOException {
// build a dictionary (from the spell package)
IndexReader sourceReader = IndexReader.open(sourceDirectory);
LuceneDictionary dict = new LuceneDictionary(sourceReader,
fieldToAutocomplete);
// code from
// org.Apache.lucene.search.spell.SpellChecker.indexDictionary(
// Dictionary)
IndexReader.unlock(autoCompleteDirectory);
// use a custom analyzer so we can do EdgeNGramFiltering
IndexWriter writer = new IndexWriter(autoCompleteDirectory,
new Analyzer() {
public TokenStream tokenStream(String fieldName,
Reader reader) {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new ISOLatin1AccentFilter(result);
result = new StopFilter(result,
ENGLISH_STOP_WORDS);
result = new EdgeNGramTokenFilter(
result, Side.FRONT, 1, 20);
return result;
}
}, true);
writer.setMergeFactor(300);
writer.setMaxBufferedDocs(150);
// go through every word, storing the original word (incl. n-grams)
// and the number of times it occurs
Map<String, Integer> wordsMap = new HashMap<String, Integer>();
Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
while (iter.hasNext()) {
String word = iter.next();
int len = word.length();
if (len < 3) {
continue; // too short we bail, but "too long" is fine...
}
if (wordsMap.containsKey(word)) {
throw new IllegalStateException(
"This should never happen in Lucene 2.3.2");
// wordsMap.put(word, wordsMap.get(word) + 1);
} else {
// use the number of documents this word appears in
wordsMap.put(word, sourceReader.docFreq(new Term(
fieldToAutocomplete, word)));
}
}
for (String word : wordsMap.keySet()) {
// ok index the word
Document doc = new Document();
doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES,
Field.Index.UN_TOKENIZED)); // orig term
doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES,
Field.Index.TOKENIZED)); // grammed
doc.add(new Field(COUNT_FIELD,
Integer.toString(wordsMap.get(word)), Field.Store.NO,
Field.Index.UN_TOKENIZED)); // count
writer.addDocument(doc);
}
sourceReader.close();
// close writer
writer.optimize();
writer.close();
// re-open our reader
reOpenReader();
}
private void reOpenReader() throws CorruptIndexException, IOException {
if (autoCompleteReader == null) {
autoCompleteReader = IndexReader.open(autoCompleteDirectory);
} else {
// reopen() returns a new reader when the index has changed,
// leaving the original untouched, so reassign the result
autoCompleteReader = autoCompleteReader.reopen();
}
autoCompleteSearcher = new IndexSearcher(autoCompleteReader);
}
public static void main(String[] args) throws Exception {
Autocompleter autocomplete = new Autocompleter("/index/autocomplete");
// run this to re-index from the current index, shouldn't need to do
// this very often
// autocomplete.reIndex(FSDirectory.getDirectory("/index/live", null),
// "content");
String term = "steve";
System.out.println(autocomplete.suggestTermsFor(term));
// prints [steve, steven, stevens, stevenson, stevenage]
}
}
Below is a transliteration of Mat's implementation to C# for Lucene.NET, along with a snippet for wiring up a text box using jQuery's autocomplete feature.
<input id="search-input" name="query" placeholder="Search database." type="text" />
... and the jQuery autocomplete:
// don't navigate away from the field when pressing tab on a selected item
$( "#search-input" ).keydown(function (event) {
if (event.keyCode === $.ui.keyCode.TAB && $(this).data("autocomplete").menu.active) {
event.preventDefault();
}
});
$( "#search-input" ).autocomplete({
source: '@Url.Action("SuggestTerms")', // <-- ASP.NET MVC Razor syntax
minLength: 2,
delay: 500,
focus: function () {
// prevent value inserted on focus
return false;
},
select: function (event, ui) {
var terms = this.value.split(/\s+/);
terms.pop(); // remove dropdown item
terms.push(ui.item.value.trim()); // add completed item
this.value = terms.join(" ");
return false;
}
});
... and here's the ASP.NET MVC controller code:
//
// GET: /MyApp/SuggestTerms?term=something
public JsonResult SuggestTerms(string term)
{
if (string.IsNullOrWhiteSpace(term))
return Json(new string[] {}, JsonRequestBehavior.AllowGet); // AllowGet needed for GET requests
term = term.Split().Last();
// Fetch suggestions
string[] suggestions = SearchSvc.SuggestTermsFor(term).ToArray();
return Json(suggestions, JsonRequestBehavior.AllowGet);
}
... and finally, here's Mat's code translated to C#:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Store;
using Lucene.Net.Index;
using Lucene.Net.Search;
using SpellChecker.Net.Search.Spell;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Documents;
namespace Cipher.Services
{
/// <summary>
/// Search term auto-completer, works for single terms (so use on the last term of the query).
/// Returns more popular terms first.
/// <br/>
/// Author: Mat Mannion, [email protected]
/// <seealso cref="http://stackoverflow.com/questions/120180/how-to-do-query-auto-completion-suggestions-in-lucene"/>
/// </summary>
///
public class SearchAutoComplete {
public int MaxResults { get; set; }
private class AutoCompleteAnalyzer : Analyzer
{
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
TokenStream result = new StandardTokenizer(kLuceneVersion, reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new ASCIIFoldingFilter(result);
result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));
result = new EdgeNGramTokenFilter(
result, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.DEFAULT_SIDE, 1, 20);
return result;
}
}
private static readonly Lucene.Net.Util.Version kLuceneVersion = Lucene.Net.Util.Version.LUCENE_29;
private static readonly String kGrammedWordsField = "words";
private static readonly String kSourceWordField = "sourceWord";
private static readonly String kCountField = "count";
private static readonly String[] kEnglishStopWords = {
"a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "i", "if", "in", "into", "is",
"no", "not", "of", "on", "or", "s", "such",
"t", "that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
private readonly Directory m_directory;
private IndexReader m_reader;
private IndexSearcher m_searcher;
public SearchAutoComplete(string autoCompleteDir) :
this(FSDirectory.Open(new System.IO.DirectoryInfo(autoCompleteDir)))
{
}
public SearchAutoComplete(Directory autoCompleteDir, int maxResults = 8)
{
this.m_directory = autoCompleteDir;
MaxResults = maxResults;
ReplaceSearcher();
}
/// <summary>
/// Find terms matching the given partial word that appear in the highest number of documents.</summary>
/// <param name="term">A word or part of a word</param>
/// <returns>A list of suggested completions</returns>
public IEnumerable<String> SuggestTermsFor(string term)
{
if (m_searcher == null)
return new string[] { };
// get the top terms for query
Query query = new TermQuery(new Term(kGrammedWordsField, term.ToLower()));
Sort sort = new Sort(new SortField(kCountField, SortField.INT));
TopDocs docs = m_searcher.Search(query, null, MaxResults, sort);
string[] suggestions = docs.ScoreDocs.Select(doc =>
m_reader.Document(doc.Doc).Get(kSourceWordField)).ToArray();
return suggestions;
}
/// <summary>
/// Open the index in the given directory and create a new index of word frequency for the
/// given index.</summary>
/// <param name="sourceDirectory">Directory containing the index to count words in.</param>
/// <param name="fieldToAutocomplete">The field in the index that should be analyzed.</param>
public void BuildAutoCompleteIndex(Directory sourceDirectory, String fieldToAutocomplete)
{
// build a dictionary (from the spell package)
using (IndexReader sourceReader = IndexReader.Open(sourceDirectory, true))
{
LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);
// code from
// org.Apache.lucene.search.spell.SpellChecker.indexDictionary(
// Dictionary)
//IndexWriter.Unlock(m_directory);
// use a custom analyzer so we can do EdgeNGramFiltering
var analyzer = new AutoCompleteAnalyzer();
using (var writer = new IndexWriter(m_directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
{
writer.MergeFactor = 300;
writer.SetMaxBufferedDocs(150);
// go through every word, storing the original word (incl. n-grams)
// and the number of times it occurs
foreach (string word in dict)
{
if (word.Length < 3)
continue; // too short we bail, but "too long" is fine...
// ok index the word
// use the number of documents this word appears in
int freq = sourceReader.DocFreq(new Term(fieldToAutocomplete, word));
var doc = MakeDocument(fieldToAutocomplete, word, freq);
writer.AddDocument(doc);
}
writer.Optimize();
}
}
// re-open our reader
ReplaceSearcher();
}
private static Document MakeDocument(String fieldToAutocomplete, string word, int frequency)
{
var doc = new Document();
doc.Add(new Field(kSourceWordField, word, Field.Store.YES,
Field.Index.NOT_ANALYZED)); // orig term
doc.Add(new Field(kGrammedWordsField, word, Field.Store.YES,
Field.Index.ANALYZED)); // grammed
doc.Add(new Field(kCountField,
frequency.ToString(), Field.Store.NO,
Field.Index.NOT_ANALYZED)); // count
return doc;
}
private void ReplaceSearcher()
{
if (IndexReader.IndexExists(m_directory))
{
if (m_reader == null)
m_reader = IndexReader.Open(m_directory, true);
else
m_reader = m_reader.Reopen(); // Reopen() returns a new reader; reassign it
m_searcher = new IndexSearcher(m_reader);
}
else
{
m_searcher = null;
}
}
}
}
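For reference, here's a minimal usage sketch of the class above (the index paths and the "content" field name are hypothetical):
var completer = new SearchAutoComplete(@"C:\index\autocomplete");
// Rebuild the suggestion index whenever the main index changes;
// this shouldn't need to run very often.
var liveIndex = FSDirectory.Open(new System.IO.DirectoryInfo(@"C:\index\live"));
completer.BuildAutoCompleteIndex(liveIndex, "content");
// Typing "inter" might yield: internet, international, ...
foreach (string suggestion in completer.SuggestTermsFor("inter"))
    Console.WriteLine(suggestion);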
Here's my code based on Lucene 4.2; it may help you:
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea4pinyin.analyzer.lucene.IKAnalyzer4PinYin;
/**
*
*
* @author <a href="mailto:[email protected]"></a>
* @version 2013-11-25 11:13:59 AM
*/
public class LuceneSpellCheckerDemoService {
private static final String INDEX_FILE = "/Users/r/Documents/jar/luke/youtui/index";
private static final String INDEX_FILE_SPELL = "/Users/r/Documents/jar/luke/spell";
private static final String INDEX_FIELD = "app_name_quanpin";
public static void main(String args[]) {
try {
//
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new IKAnalyzer4PinYin(
true));
// read index conf
IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_42, wrapper);
conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
// read dictionary
Directory directory = FSDirectory.open(new File(INDEX_FILE));
RAMDirectory ramDir = new RAMDirectory(directory, IOContext.READ);
DirectoryReader indexReader = DirectoryReader.open(ramDir);
Dictionary dic = new LuceneDictionary(indexReader, INDEX_FIELD);
SpellChecker sc = new SpellChecker(FSDirectory.open(new File(INDEX_FILE_SPELL)));
//sc.indexDictionary(new PlainTextDictionary(new File("myfile.txt")), conf, false);
sc.indexDictionary(dic, conf, true);
String[] strs = sc.suggestSimilar("zhsiwusdazhanjiangshi", 10);
for (int i = 0; i < strs.length; i++) {
System.out.println(strs[i]);
}
sc.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
You can use the PrefixQuery class against a "dictionary" index. The LuceneDictionary class could be helpful too.
Take a look at the article linked below. It explains how to implement the "Did You Mean?" feature available in modern search engines such as Google. You may not need something as complex as what's described in the article, but it does explain how to use the Lucene spell package.
One way to build a "dictionary" index is to iterate over a LuceneDictionary.
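For illustration, here's a minimal sketch of that approach against the Lucene 2.x-era API used elsewhere on this page (the "word" field, the "content" source field, and the index paths are all hypothetical; it reuses the imports from Mat's class above, plus org.apache.lucene.analysis.KeywordAnalyzer and org.apache.lucene.search.PrefixQuery):
Directory sourceDirectory = FSDirectory.getDirectory("/index/live", null);
Directory dictionaryDirectory = FSDirectory.getDirectory("/index/dictionary", null);
// Build the "dictionary" index: iterate the LuceneDictionary and store
// each distinct term as a single untokenized field.
IndexReader sourceReader = IndexReader.open(sourceDirectory);
LuceneDictionary dictionary = new LuceneDictionary(sourceReader, "content");
IndexWriter writer = new IndexWriter(dictionaryDirectory, new KeywordAnalyzer(), true);
Iterator<String> it = (Iterator<String>) dictionary.getWordsIterator();
while (it.hasNext()) {
    Document doc = new Document();
    doc.add(new Field("word", it.next(), Field.Store.YES, Field.Index.UN_TOKENIZED));
    writer.addDocument(doc);
}
writer.optimize();
writer.close();
sourceReader.close();
// Suggest completions for the prefix the user has typed so far.
IndexSearcher searcher = new IndexSearcher(dictionaryDirectory);
Query query = new PrefixQuery(new Term("word", "inter"));
TopDocs hits = searcher.search(query, null, 10);
for (ScoreDoc sd : hits.scoreDocs) {
    System.out.println(searcher.doc(sd.doc).get("word")); // internet, international, ...
}
searcher.close();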
Hope it helps.
In addition to the above (much appreciated) post re: the C# conversion, if you're using .NET 3.5 you'll need to include the code for the EdgeNGramTokenFilter - at least I did, using Lucene 2.9.2 - as far as I could tell, this filter is missing from the .NET version. I had to go find the .NET 4 version online in 2.9.3 and port it back - hope this makes the procedure less painful for someone...
Edit: please also note that the array returned by the SuggestTermsFor() function is sorted by count ascending; you'll probably want to reverse it to get the most popular terms first in your list.
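For example, one way to flip the order at the source is to mark the sort field as reversed when building the Sort in SuggestTermsFor() (this assumes the three-argument SortField constructor with a reverse flag, which Lucene.NET 2.9 provides):
// sort by count descending so the most popular terms come first
Sort sort = new Sort(new SortField(kCountField, SortField.INT, true));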
using System.IO;
using System.Collections;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
namespace Lucene.Net.Analysis.NGram
{
/**
* Tokenizes the given token into n-grams of given size(s).
* <p>
* This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token.
* </p>
*/
public class EdgeNGramTokenFilter : TokenFilter
{
public static Side DEFAULT_SIDE = Side.FRONT;
public static int DEFAULT_MAX_GRAM_SIZE = 1;
public static int DEFAULT_MIN_GRAM_SIZE = 1;
// Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
/** Specifies which side of the input the n-gram should be generated from */
public class Side
{
private string label;
/** Get the n-gram from the front of the input */
public static Side FRONT = new Side("front");
/** Get the n-gram from the end of the input */
public static Side BACK = new Side("back");
// Private ctor
private Side(string label) { this.label = label; }
public string getLabel() { return label; }
// Get the appropriate Side from a string
public static Side getSide(string sideName)
{
if (FRONT.getLabel().Equals(sideName))
{
return FRONT;
}
else if (BACK.getLabel().Equals(sideName))
{
return BACK;
}
return null;
}
}
private int minGram;
private int maxGram;
private Side side;
private char[] curTermBuffer;
private int curTermLength;
private int curGramSize;
private int tokStart;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
protected EdgeNGramTokenFilter(TokenStream input) : base(input)
{
this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
: base(input)
{
if (side == null)
{
throw new System.ArgumentException("sideLabel must be either front or back");
}
if (minGram < 1)
{
throw new System.ArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram)
{
throw new System.ArgumentException("minGram must not be greater than maxGram");
}
this.minGram = minGram;
this.maxGram = maxGram;
this.side = side;
this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram)
: this(input, Side.getSide(sideLabel), minGram, maxGram)
{
}
public override bool IncrementToken()
{
while (true)
{
if (curTermBuffer == null)
{
if (!input.IncrementToken())
{
return false;
}
else
{
curTermBuffer = (char[])termAtt.TermBuffer().Clone();
curTermLength = termAtt.TermLength();
curGramSize = minGram;
tokStart = offsetAtt.StartOffset();
}
}
if (curGramSize <= maxGram)
{
if (!(curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
|| curGramSize > maxGram))
{ // if we have hit the end of our n-gram size range, quit
// grab gramSize chars from front or back
int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
int end = start + curGramSize;
ClearAttributes();
offsetAtt.SetOffset(tokStart + start, tokStart + end);
termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
curGramSize++;
return true;
}
}
curTermBuffer = null;
}
}
public override Token Next(Token reusableToken)
{
return base.Next(reusableToken);
}
public override Token Next()
{
return base.Next();
}
public override void Reset()
{
base.Reset();
curTermBuffer = null;
}
}
}