前一段时间做东西用到了pylucene,包括建立索引,检索,高亮显示等等。贴两段代码,希望对大家有用。

pylucene的安装就不多说了,我用的版本是PyLucene-1.9.1。

建立索引:


#!/usr/bin/env python




import os
import sys

import PyLucene



class IndexFiles:
    """Build a Lucene index over the ``.txt`` files below a directory.

    Constructing an instance does all of the work: the store directory is
    created when missing, a new index is written into it, every ``.txt``
    file found under ``root`` is added, and the index is optimized and
    closed.
    """

    def __init__(self, root, storeDir, analyzer):
        """Create the index in ``storeDir`` from the files under ``root``.

        root: directory that is walked recursively for ``.txt`` files.
        storeDir: directory that receives the index files; it is created
            (including missing parents) when it does not exist.
        analyzer: analyzer handed to the IndexWriter. ``None`` selects a
            ``PyLucene.StandardAnalyzer``.
        """
        if not os.path.exists(storeDir):
            os.makedirs(storeDir)
        # Honour the caller's analyzer instead of always replacing it.
        if analyzer is None:
            analyzer = PyLucene.StandardAnalyzer()
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            self.indexDocs(root, writer)
            # No newline here so that "done" ends up on the same line.
            sys.stdout.write("optimizing index ")
            writer.optimize()
        finally:
            # Close the writer even when indexing or optimizing fails.
            writer.close()
        print("done")

    def indexDocs(self, root, writer):
        """Add one document per ``.txt`` file below ``root`` to ``writer``.

        Each document gets a ``name`` field (the file name), a ``path``
        field (the joined path) and, when the file is not empty, a
        ``contents`` field. File contents, names and paths are decoded
        as gbk. An empty file is still added, with a warning. A file that
        cannot be read, decoded or added is reported and skipped, so one
        bad file does not abort the run.
        """
        for dirpath, _dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith(".txt"):
                    continue
                print("adding %s" % (filename,))
                try:
                    path = os.path.join(dirpath, filename)
                    handle = open(path)
                    try:
                        contents = unicode(handle.read(), "gbk")
                    finally:
                        # Release the handle even if decoding fails.
                        handle.close()
                    doc = PyLucene.Document()
                    doc.add(PyLucene.Field.Keyword(u"name",
                                                   filename.decode("gbk")))
                    doc.add(PyLucene.Field.Text(u"path",
                                                path.decode("gbk")))
                    if contents:
                        doc.add(PyLucene.Field.Text(u"contents", contents))
                    else:
                        print("warning: no content in %s" % (filename,))
                    writer.addDocument(doc)
                except Exception as e:
                    # Best effort: report the failure and go on.
                    print("Failed in indexDocs: %s" % (e,))


def indexmain(path):
    """Index the ``.txt`` files under ``path`` into the ``index`` directory.

    Returns an empty string when indexing succeeded; otherwise the
    exception that was raised is returned (not re-raised).
    """
    try:
        IndexFiles(path, "index", PyLucene.StandardAnalyzer())
    except Exception as e:
        return e
    return ""
    

if __name__ == "__main__":
    # Ask for the directory whose .txt files should be indexed.
    indexpath = raw_input("path: ")
    error = indexmain(indexpath)
    # indexmain signals failure by returning the exception ('' on success),
    # so report it instead of exiting silently.
    if error:
        print("indexing failed: %s" % (error,))

检索,高亮显示:

 


#!/usr/bin/env python




import
 time

from
 StringIO 
import
 StringIO

from
 PyLucene 
import
 
*




class TestFormatter(Formatter):
    """Highlighter formatter that wraps scoring terms in a red font tag."""

    def __init__(self):
        # Nothing to set up; the base-class initializer is not invoked here.
        pass

    def highlightTerm(self, originalText, group):
        """Return ``originalText``, wrapped in markup when it scored.

        originalText: the text of the term as it appears in the document.
        group: token group for the term; when its ``getTotalScore()`` is
            zero or less the text is returned unchanged.
        """
        if group.getTotalScore() <= 0:
            return originalText
        # Plain ASCII quotes: typographic quotes are not valid attribute
        # delimiters in HTML.
        return '<font color="red">' + originalText + '</font>'




class Search:
    """Run queries against the index in ``index`` and highlight the hits."""

    def __init__(self):
        """Open the ``index`` directory and set the highlighting options."""
        STORE_DIR = "index"
        self.directory = FSDirectory.getDirectory(STORE_DIR, False)
        self.analyzer = ChineseAnalyzer()
        self.maxNumFragmentsRequired = 2
        # NOTE(review): the separator literal is empty in the published
        # source; confirm that an empty separator is what was intended.
        self.fragmentSeparator = u""

    @staticmethod
    def _freeKey(taken, score):
        """Return ``score``, raised in 0.0001 steps until unused in ``taken``."""
        while score in taken:
            score += 0.0001
        return score

    def search(self, query, start, limit=10):
        """Search the ``contents`` field and return one page of results.

        query: query text; a byte string is decoded as gbk, a unicode
            string is used as it is.
        start: index of the first hit to return.
        limit: maximum number of hits to return (10 by default).

        Returns ``(resultdic, totalnum, usetime)``:
        resultdic maps a hit's score to ``[highlighted fragments, path]``,
        totalnum is the total number of hits, and usetime is the time in
        seconds from just after the query was parsed until the page was
        assembled.
        """
        searcher = IndexSearcher(self.directory)
        try:
            # Decode only byte strings: calling .decode on text that is
            # already unicode fails for non-ASCII queries.
            if not isinstance(query, unicode):
                query = query.decode("gbk")
            query = QueryParser.parse(query, "contents", self.analyzer)
            starttime = time.time()
            hits = searcher.search(query)
            highlighter = Highlighter(TestFormatter(), QueryScorer(query))
            highlighter.setTextFragmenter(SimpleFragmenter(60))
            resultdic = {}
            totalnum = hits.length()
            for index in range(start, min(start + limit, totalnum)):
                try:
                    doc = hits.doc(index)
                except Exception:
                    # Best effort: leave out a hit that cannot be loaded.
                    continue
                text = doc.get("contents")
                tokenStream = self.analyzer.tokenStream("contents",
                                                        StringIO(text))
                result = highlighter.getBestFragments(
                    tokenStream,
                    text,
                    self.maxNumFragmentsRequired,
                    self.fragmentSeparator)
                # Scores are the dict keys, so keep nudging an equal score
                # until it is unique; otherwise hits overwrite each other.
                score = self._freeKey(resultdic, hits.score(index))
                resultdic[score] = [result, doc.get("path")]
            usetime = time.time() - starttime
        finally:
            # Release the searcher even when parsing or searching fails.
            searcher.close()
        return resultdic, totalnum, usetime
        


if __name__ == "__main__":
    tt = Search()
    # Search.search decodes its query from gbk itself, so pass the console
    # input through unchanged. Decoding it here as well would hand over a
    # unicode string whose second .decode("gbk") fails on non-ASCII text.
    command = raw_input("Query:")
    # Fetch the first page of hits; the returned results are not displayed.
    tt.search(command, 0)

处理中文时要注意编码:上面的示例假定文件内容、文件名、路径以及查询输入都是 GBK 编码,交给 PyLucene 之前需要先解码成 unicode。

 


版权声明:本文为chuter原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
原文链接:https://blog.csdn.net/chuter/article/details/1672364