Lucene中的跨度查询示例

作者: kunshan_shenbin
发布时间:2015-07-08 16:57:10

信息源自:http://callan.javaeye.com/blog/154251

SpanQuery(跨度查询)是根据词在文档中的位置距离进行匹配的一类查询,可以用来查找相邻或相距在一定范围之内的若干个词。

SpanQuery包括以下几种:

SpanTermQuery:词距查询的基础,结果和TermQuery相似,只不过是增加了查询结果中单词的距离信息。

SpanFirstQuery:从字段内容的起始位置开始,在指定的距离范围内查找指定词距查询结果的查询。

SpanNearQuery:要求被查询的几个词距查询结果之间保持着一定的距离。

SpanOrQuery:同时执行几个词距查询,并把它们的结果合并起来。

SpanNotQuery:从一个词距查询结果中,去除一个词距查询。

示例代码如下:

package com.lucene.search;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.store.RAMDirectory;

/**
 * Demonstrates the span query family ({@link SpanQuery} is the abstract base class)
 * against a two-document in-memory index.
 *
 * <p>Call {@link #index()} first, then any of the {@code span*QueryTest} methods, and
 * finally {@link #close()}. Each test prints every matching span to standard output with
 * the matched token range wrapped in angle brackets, followed by the document score.
 */
public class SpanQueryTest {

    /** Name of the single indexed field. */
    private static final String FIELD = "field";

    private RAMDirectory directory;
    private IndexSearcher indexSearcher;
    private IndexReader reader;
    private SpanTermQuery quick;
    private SpanTermQuery brown;
    private SpanTermQuery red;
    private SpanTermQuery fox;
    private SpanTermQuery lazy;
    private SpanTermQuery sleepy;
    private SpanTermQuery dog;
    private SpanTermQuery cat;
    private Analyzer analyzer;

    /**
     * Builds the in-memory index, creates the reusable term-level span queries and opens
     * the searcher and reader used by {@link #dumpSpans(SpanQuery)}.
     *
     * @throws IOException if writing to or opening the index fails
     */
    public void index() throws IOException {
        directory = new RAMDirectory();
        analyzer = new WhitespaceAnalyzer();

        IndexWriter writer = new IndexWriter(directory, analyzer, true);
        try {
            writer.addDocument(newDocument("the quick brown fox jumps over the lazy dog"));
            writer.addDocument(newDocument("the quick red fox jumps over the sleepy cat"));
            writer.optimize();
        } finally {
            // Close the writer even when indexing fails.
            writer.close();
        }

        quick = spanTerm("quick");
        brown = spanTerm("brown");
        red = spanTerm("red");
        fox = spanTerm("fox");
        lazy = spanTerm("lazy");
        sleepy = spanTerm("sleepy");
        dog = spanTerm("dog");
        cat = spanTerm("cat");

        indexSearcher = new IndexSearcher(directory);
        reader = IndexReader.open(directory);
    }

    /**
     * Closes the searcher and the reader opened by {@link #index()}.
     * Safe to call when {@link #index()} was never invoked.
     *
     * @throws IOException if closing either resource fails
     */
    public void close() throws IOException {
        try {
            if (indexSearcher != null) {
                indexSearcher.close();
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /** Creates a document holding {@code text} in the stored, tokenized {@link #FIELD}. */
    private static Document newDocument(String text) {
        Document doc = new Document();
        doc.add(new Field(FIELD, text, Store.YES, Index.TOKENIZED));
        return doc;
    }

    /** Creates a span term query for {@code word} in {@link #FIELD}. */
    private static SpanTermQuery spanTerm(String word) {
        return new SpanTermQuery(new Term(FIELD, word));
    }

    /**
     * Runs {@code query} and prints one line per matching span: the document text with the
     * matched token range wrapped in {@code <...>}, followed by the document score in
     * parentheses.
     *
     * @param query the span query to execute
     * @throws IOException if searching or reading the index fails
     */
    private void dumpSpans(SpanQuery query) throws IOException {
        // A span query can be searched like any other query; the hits supply the scores.
        Hits hits = indexSearcher.search(query);

        // Scores are looked up by document id, so size the table from the reader
        // instead of hard-coding the number of indexed documents.
        float[] scores = new float[reader.maxDoc()];
        for (int i = 0; i < hits.length(); i++) {
            scores[hits.id(i)] = hits.score(i);
        }

        // In addition to the hits, a span query exposes the positions of every match.
        Spans spans = query.getSpans(reader);
        while (spans.next()) {
            int id = spans.doc();
            Document doc = reader.document(id);
            Token[] tokens = AnalyzerUtils.tokensFromAnalysis(analyzer, doc.get(FIELD));

            StringBuffer buffer = new StringBuffer();
            for (int i = 0; i < tokens.length; i++) {
                // Example: searching "brown" in "the quick brown fox ..." yields
                // start() == 2 and end() == 3, so "<" goes before token 2 and ">"
                // after the token at index end() - 1.
                if (i == spans.start()) {
                    buffer.append("<");
                }
                buffer.append(tokens[i].termText());
                if (i + 1 == spans.end()) {
                    buffer.append(">");
                }
                buffer.append(" ");
            }
            buffer.append("(" + scores[id] + ") ");
            System.out.println(buffer);
        }
    }

    /**
     * SpanTermQuery: finds the same documents as a TermQuery but also records position
     * information, which makes it the building block for the other span queries.
     *
     * @throws IOException if searching fails
     */
    public void spanTermQueryTest() throws IOException {
        dumpSpans(brown);

        // Output reported by the original author:
        // the quick <brown> fox jumps over the lazy dog (0.22097087)
    }

    /**
     * SpanFirstQuery: matches spans of the wrapped query that lie within a fixed distance
     * from the start of the field.
     *
     * @throws IOException if searching fails
     */
    public void spanFirstQueryTest() throws IOException {
        // doc1: the quick brown fox jumps over the lazy dog
        // "brown" is the third token, so the end limit must be at least 3 to reach it.
        SpanFirstQuery firstQuery = new SpanFirstQuery(brown, 3);
        dumpSpans(firstQuery);

        // Output reported by the original author:
        // the quick <brown> fox jumps over the lazy dog (0.22097087)
    }

    /**
     * SpanNearQuery: similar to PhraseQuery, but the clauses are span queries, so the
     * result of another span query can be nested and treated as a single unit.
     *
     * @throws IOException if searching fails
     */
    public void spanNearQueryTest() throws IOException {
        // doc1: the quick brown fox jumps over the lazy dog

        // The second argument (slop) limits how many positions may separate the clauses;
        // the third argument (inOrder) set to true requires the clauses to match in the
        // order given.
        // NOTE(review): the original write-up claimed slop must be >= 5 for this query.
        // quick, brown and fox are adjacent and in order in doc1, so a smaller slop
        // presumably matches as well - confirm against the SpanNearQuery documentation.
        SpanNearQuery nearQuery =
                new SpanNearQuery(new SpanQuery[] { quick, brown, fox }, 5, true);
        dumpSpans(nearQuery);

        // Unordered match (inOrder == false) of quick, dog and brown. The matching span
        // runs from "quick" to "dog" and contains five tokens that are not part of the
        // query, which is why a slop of 5 is used here.
        nearQuery = new SpanNearQuery(new SpanQuery[] { quick, dog, brown }, 5, false);
        dumpSpans(nearQuery);

        // Output reported by the original author:
        // first dumpSpans:  the <quick brown fox> jumps over the lazy dog (0.34204215)
        // second dumpSpans: the <quick brown fox jumps over the lazy dog> (0.27026406)
    }

    /**
     * SpanNotQuery: takes the spans of the first query and drops those that overlap
     * spans of the second query.
     *
     * @throws IOException if searching fails
     */
    public void spanNotQueryTest() throws IOException {
        // "quick", one arbitrary token, then "fox": matches "quick brown fox" in doc1
        // and "quick red fox" in doc2.
        SpanNearQuery quickFox = new SpanNearQuery(new SpanQuery[] { quick, fox }, 1, true);
        dumpSpans(quickFox);

        // Removing the spans that contain "red" leaves only "quick brown fox".
        SpanNotQuery quickFoxWithoutRed = new SpanNotQuery(quickFox, red);
        dumpSpans(quickFoxWithoutRed);

        // Output reported by the original author (first two lines come from the first
        // dumpSpans call, the third line from the second call):
        // the <quick brown fox> jumps over the lazy dog (0.18579213)
        // the <quick red fox> jumps over the sleepy cat (0.18579213)
        // the <quick brown fox> jumps over the lazy dog (0.18579213)
    }

    /**
     * SpanOrQuery: combines the spans of all of its clauses into one result.
     *
     * @throws IOException if searching fails
     */
    public void spanOrQueryTest() throws IOException {
        SpanNearQuery quickFox = new SpanNearQuery(new SpanQuery[] { quick, fox }, 1, true);
        SpanNearQuery lazyDog = new SpanNearQuery(new SpanQuery[] { lazy, dog }, 0, true);
        SpanNearQuery sleepyCat = new SpanNearQuery(new SpanQuery[] { sleepy, cat }, 0, true);

        SpanNearQuery quickFoxNearLazyDog =
                new SpanNearQuery(new SpanQuery[] { quickFox, lazyDog }, 3, true);
        dumpSpans(quickFoxNearLazyDog);

        SpanNearQuery quickFoxNearSleepyCat =
                new SpanNearQuery(new SpanQuery[] { quickFox, sleepyCat }, 3, true);
        dumpSpans(quickFoxNearSleepyCat);

        SpanOrQuery or =
                new SpanOrQuery(new SpanQuery[] { quickFoxNearLazyDog, quickFoxNearSleepyCat });
        dumpSpans(or);

        // Output reported by the original author (line 1 from the first dumpSpans call,
        // line 2 from the second, lines 3 and 4 from the third):
        // the <quick brown fox jumps over the lazy dog> (0.3321948)
        // the <quick red fox jumps over the sleepy cat> (0.3321948)
        // the <quick brown fox jumps over the lazy dog> (0.5405281)
        // the <quick red fox jumps over the sleepy cat> (0.5405281)
    }

    /**
     * Builds the index and runs the SpanOrQuery demonstration.
     *
     * @param args ignored
     * @throws IOException if indexing or searching fails
     */
    public static void main(String[] args) throws IOException {
        SpanQueryTest test = new SpanQueryTest();
        try {
            test.index();
            test.spanOrQueryTest();
        } finally {
            test.close();
        }
    }
}

/** Helper for turning text into the tokens an analyzer produces for it. */
class AnalyzerUtils {

    /**
     * Runs {@code text} through {@code analyzer} and returns every token it emits, in order.
     *
     * @param analyzer the analyzer used to tokenize the text
     * @param text the text to analyze
     * @return the emitted tokens; an empty array when the analyzer produces none
     * @throws IOException if the token stream cannot be read
     */
    public static Token[] tokensFromAnalysis(Analyzer analyzer, String text) throws IOException {
        // NOTE(review): the stream is requested for field "contents" although the caller
        // in this file analyzes text from "field" - confirm the analyzer ignores the name.
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        List list = new ArrayList();
        try {
            // next() returns null once the stream is exhausted.
            for (Token token = stream.next(); token != null; token = stream.next()) {
                list.add(token);
            }
        } finally {
            stream.close();
        }
        return (Token[]) list.toArray(new Token[list.size()]);
    }
}

其他资源:http://blog.csdn.net/caoxu1987728/archive/2008/04/25/2328745.aspx

标签: Lucene
来源:http://blog.csdn.net/kunshan_shenbin/article/details/2524453

推荐: