LUCENE.NET使用探秘
对于满足全文检索的需求来说, Lucene.Net 无疑是一个很好的选择。它引入了增量索引的策略,解决了在数据频繁改动时重建索引的问题,这对于提高 Web 应用的性能至关重要(其他相关特性大家可以参看官方文档)。Lucene.Net 是基于文档的全文搜索,所以使用 Lucene.Net 时要先把数据库中的数据导出来,这也是一个建立索引的过程。代码如下:
/// <summary>
/// Adds the given models to the search index. Any entry already indexed under the
/// same ID is deleted first, so calling this for an existing record updates it.
/// </summary>
/// <param name="models"> Data collection to index </param>
/// <param name="optimize"> Whether to optimize the index after all models have been added </param>
/// <exception cref="ArgumentNullException"> <paramref name="models"/> is null </exception>
public void AddToSearchIndex(IEnumerable<T> models, bool optimize = false)
{
    if (models == null)
    {
        throw new ArgumentNullException("models");
    }

    // Documents are indexed under the property names, and Lucene field names are
    // case-sensitive, so the delete term must use exactly the same "ID" spelling
    // as the field written below; otherwise old entries are never removed.
    const string idFieldName = "ID";

    using (var analyzer = new StandardAnalyzer(Version.LUCENE_30))
    using (var writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        foreach (var model in models)
        {
            // T is unconstrained, so the ID property is read through dynamic binding.
            dynamic item = model;
            string id = item.ID.ToString();

            // remove older index entry
            writer.DeleteDocuments(new TermQuery(new Term(idFieldName, id)));

            var doc = new Document();
            foreach (var prop in Props)
            {
                var value = prop.GetValue(model);
                if (value == null)
                {
                    // null properties are simply not indexed
                    continue;
                }

                // only store ID, we use it to retrieve model data from DB
                var store = prop.Name == idFieldName ? Field.Store.YES : Field.Store.NO;
                doc.Add(new Field(prop.Name, value.ToString(), store, Field.Index.ANALYZED));
            }
            writer.AddDocument(doc);
        }

        if (optimize)
        {
            writer.Optimize();
        }
    }
}
上述函数用于把导出的数据添加到索引文件中,我们可以指定是否在完成插入后优化索引。优化索引可以提高检索速度,但会消耗 CPU 资源,不建议经常优化它。另外,我们在插入索引时会先检测是更新还是添加,这用于完成对旧数据的更新。那么,如果数据库移除了一条记录,对于索引文件我们又该如何做呢?
和数据库操作类似,当从数据库移除记录时,从索引文件中移除相应记录即可,代码如下:
/// <summary>
/// Removes the index entry of the specified record and commits the deletion.
/// </summary>
/// <param name="record_id"> the record's ID </param>
public void ClearSearchIndex(int record_id)
{
    // Both disposables are wrapped in using so the analyzer is released even
    // when opening the writer or deleting fails.
    using (var analyzer = new StandardAnalyzer(Version.LUCENE_30))
    using (var writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        // The term field must match the indexed field name ("ID") exactly;
        // Lucene field names are case- and whitespace-sensitive.
        var searchQuery = new TermQuery(new Term("ID", record_id.ToString()));
        writer.DeleteDocuments(searchQuery);
        writer.Commit();
    }
}
同样,我们可以删除所有的索引记录
/// <summary>
/// Removes all index records.
/// </summary>
/// <returns> true when the index was cleared and committed; false when any step threw </returns>
public bool ClearAllSearchIndex()
{
    try
    {
        // using guarantees disposal without touching a possibly-null reference:
        // if the analyzer constructor itself throws there is nothing to dispose.
        using (var analyzer = new StandardAnalyzer(Version.LUCENE_30))
        using (var writer = new IndexWriter(_directory, analyzer, true,
            IndexWriter.MaxFieldLength.UNLIMITED))
        {
            // remove older index entries
            writer.DeleteAll();
            writer.Commit();
        }
        return true;
    }
    catch (Exception)
    {
        // Failure is reported through the return value, as the method's contract states.
        return false;
    }
}
下面该主角登场了,看看如何检索记录吧:
/// <summary>
/// Searches for the specified value in all fields, or in a single field when one is given.
/// </summary>
/// <param name="querystring"> value to search </param>
/// <param name="fieldname"> field to search; all fields are searched by default </param>
/// <returns>
/// The CurrentID values obtained by converting each hit's stored ID through SpecEquipmentID
/// (at most 1000 hits); an empty list when the query is empty or contains only wildcards.
/// </returns>
public IEnumerable<int> Search(string querystring, string fieldname = "")
{
    if (string.IsNullOrEmpty(querystring))
    {
        return new List<int>();
    }

    // remove invalid characters
    querystring = ParseSearchString(querystring);

    // validation: a query with nothing left besides wildcards is not worth running
    if (string.IsNullOrEmpty(querystring)
        || string.IsNullOrEmpty(querystring.Replace("*", "").Replace("?", "")))
    {
        return new List<int>();
    }

    // the max hit record count
    const int hitsLimit = 1000;

    // using on both disposables releases the analyzer even if parsing or searching throws
    using (var searcher = new IndexSearcher(_directory, true))
    using (var analyzer = new StandardAnalyzer(Version.LUCENE_30))
    {
        // the parser splits the query string into terms to match against the index
        QueryParser parser;
        if (!string.IsNullOrEmpty(fieldname))
        {
            // restrict the search to the specified field
            parser = new QueryParser(Version.LUCENE_30, fieldname, analyzer);
        }
        else
        {
            // search across every indexed field
            string[] fields = Props.Select(p => p.Name).ToArray<string>();
            parser = new MultiFieldQueryParser(Version.LUCENE_30, fields, analyzer);
        }

        // create a query instance from the parser and the query string
        Query query = ParseQuery(querystring, parser);

        // get the hit records
        ScoreDoc[] hits = searcher.Search(query, hitsLimit).ScoreDocs;

        // Translate each index record's stored ID to the DB record's ID.
        // ToList() materialises the result before the searcher is disposed.
        return hits
            .Select(hit => searcher.Doc(hit.Doc))
            .Select(doc => ((SpecEquipmentID)int.Parse(doc.Get("ID"))).CurrentID)
            .ToList();
    }
}
从上述可以看出,我们可以指定在若干字段间搜索,这些字段间的检索同样可采用模糊检索的模式:
/// <summary>
/// Searches several fields at once. Every field/query pair must match (Occur.MUST),
/// i.e. the conditions are combined with logical AND.
/// </summary>
/// <param name="multiFieldsDict"> maps a field name to the query text to look for in that field </param>
/// <returns>
/// The distinct CurrentID values obtained by converting each hit's stored ID through
/// SpecEquipmentID (at most 1000 hits); an empty list when no conditions are given.
/// </returns>
/// <exception cref="ArgumentNullException"> <paramref name="multiFieldsDict"/> is null </exception>
public IEnumerable<int> MultiFieldsSearch(Dictionary<string, string> multiFieldsDict)
{
    if (multiFieldsDict == null)
    {
        throw new ArgumentNullException("multiFieldsDict");
    }
    if (multiFieldsDict.Count == 0)
    {
        return new List<int>();
    }

    // the max hit record count
    const int hitsLimit = 1000;

    // Snapshot the pairs once so the fields, queries and flags arrays are guaranteed
    // to line up index-for-index.
    var conditions = multiFieldsDict.ToList();
    string[] fields = conditions.Select(c => c.Key).ToArray();
    string[] queries = conditions.Select(c => c.Value).ToArray();
    var occurs = conditions.Select(c => Occur.MUST).ToArray();

    // using on both disposables releases the analyzer even if parsing or searching throws
    using (var searcher = new IndexSearcher(_directory, true))
    using (var analyzer = new StandardAnalyzer(Version.LUCENE_30))
    {
        Query query = MultiFieldQueryParser.Parse(Version.LUCENE_30, queries,
            fields, occurs, analyzer);

        ScoreDoc[] hits = searcher.Search(query, hitsLimit).ScoreDocs;

        // Translate each index record's stored ID to the DB record's ID.
        // ToList() materialises the result before the searcher is disposed.
        return hits
            .Select(hit => searcher.Doc(hit.Doc))
            .Select(doc => ((SpecEquipmentID)int.Parse(doc.Get("ID"))).CurrentID)
            .Distinct()
            .ToList();
    }
}
在这里解释下: 为什么用QueryParser生成Query的实例?
使用 QueryParser 可以让我们在指定的字段间使用模糊查询,也就是说,只要相应的记录之中包含检索值,都会被命中,这也正是全文搜索所必需的。如果不采用以上方式,可以使用 BooleanQuery 结合 TermQuery 在指定字段间搜索,但这样一来,只有同值记录(精确查询)会被命中。这些搜索条件间同样可以像数据库查询那样采用‘与或非’的形式。
最后说明一下:对于数值类型和日期类型的处理比较特殊,如果采用像字符串那样的处理方式,结果的精确性就会下降,至于如何处理针对数值类型和日期类型的数据检索,大家可以参考 Lucene 的官方文档。提及一下我的解决方案:我们可以采用常规数据库与 Lucene 结合的方式,让 Lucene 处理字符串类型的检索,常规数据库处理日期及数值类型的检索,各展所长。
标签: Lucene.Net , C#
作者: Leo_wl
出处: http://www.cnblogs.com/Leo_wl/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文链接,否则保留追究法律责任的权利。
版权信息