前一段时间做东西用到了pylucene,包括建立索引,检索,高亮显示等等。贴两段代码,希望对大家有用。
pylucene的安装就不多说了,我用的版本是PyLucene-1.9.1。
建立索引:

#!/usr/bin/env python

# Indexing script: standard library first, then the PyLucene binding.
import os

import PyLucene



class IndexFiles:
    """Build a Lucene index from the ``.txt`` files under a directory tree.

    Instantiating the class does all the work: the index directory is
    created when missing, every ``.txt`` file found below ``root`` is added
    as one document, and the index is then optimized and closed.

    Each document gets up to three fields:
      * ``name``     -- the file name (added with ``Field.Keyword``);
      * ``path``     -- the joined directory and file name (``Field.Text``);
      * ``contents`` -- the decoded file text (``Field.Text``), omitted
                        for empty files.
    """

    # Encoding assumed for file contents and for byte-string file names.
    ENCODING = 'gbk'

    def __init__(self, root, storeDir, analyzer):
        """Index every ``.txt`` file under ``root`` into ``storeDir``.

        root     -- directory tree to walk.
        storeDir -- directory that receives the index; created (including
                    missing parents) when it does not exist.
        analyzer -- analyzer handed to the ``IndexWriter``. ``None`` selects
                    ``PyLucene.StandardAnalyzer()``.

        The directory and the writer are both opened with their third /
        second argument set to ``True`` (the "create" flag in the Lucene
        API -- TODO confirm it discards a previous index in ``storeDir``).
        """
        import sys

        if not os.path.exists(storeDir):
            os.makedirs(storeDir)
        if analyzer is None:
            analyzer = PyLucene.StandardAnalyzer()
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            self.indexDocs(root, writer)
            # No newline here so that "done" lands on the same line.
            sys.stdout.write('optimizing index ')
            writer.optimize()
        finally:
            # Release the writer even when indexing or optimizing failed.
            writer.close()
        print('done')

    @staticmethod
    def _to_unicode(value, encoding='gbk'):
        """Return ``value`` as text, decoding byte strings with ``encoding``.

        Values that are already text are returned unchanged instead of
        being decoded a second time.
        """
        try:
            return value.decode(encoding)
        except (AttributeError, UnicodeEncodeError):
            # AttributeError: the object has no ``decode`` (already text).
            # UnicodeEncodeError: a Python 2 ``unicode`` object holding
            # non-ASCII characters was implicitly ASCII-encoded by decode().
            return value

    def indexDocs(self, root, writer):
        """Add one document per ``.txt`` file found below ``root``.

        root   -- directory tree to walk with ``os.walk``.
        writer -- open index writer; ``addDocument`` is called on it.

        A failure on one file is reported on stdout and does not stop the
        walk; files with other extensions are skipped silently.
        """
        import sys

        for dirpath, _dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print("adding %s" % (filename,))
                try:
                    path = os.path.join(dirpath, filename)
                    # Read raw bytes and decode explicitly; close the file
                    # even if reading or decoding raises.
                    source = open(path, 'rb')
                    try:
                        contents = source.read().decode(self.ENCODING)
                    finally:
                        source.close()
                    doc = PyLucene.Document()
                    doc.add(PyLucene.Field.Keyword(
                        u"name", self._to_unicode(filename, self.ENCODING)))
                    doc.add(PyLucene.Field.Text(
                        u"path", self._to_unicode(path, self.ENCODING)))
                    if contents:
                        doc.add(PyLucene.Field.Text(u"contents", contents))
                    else:
                        print("warning: no content in %s" % (filename,))
                    writer.addDocument(doc)
                except Exception:
                    # Best effort: report and carry on with the next file.
                    print("Failed in indexDocs: %s" % (sys.exc_info()[1],))


def indexmain(path):
    """Index the ``.txt`` files under ``path`` into the ``index`` directory.

    path -- directory tree handed to ``IndexFiles``.

    Returns an empty string on success. An exception raised while indexing
    is caught and returned (not re-raised), so callers should inspect the
    return value.
    """
    import sys

    try:
        IndexFiles(path, "index", PyLucene.StandardAnalyzer())
    except Exception:
        return sys.exc_info()[1]
    return ''

if __name__ == '__main__':
    # Prompt for the directory to index and build the index from it.
    indexpath = raw_input("path:")
    failure = indexmain(indexpath)
    # indexmain returns the exception instead of raising it; surface it
    # rather than exiting silently.
    if failure:
        print("indexing failed: %s" % (failure,))
检索,高亮显示:

#!/usr/bin/env python

# Search script: standard library first, then the PyLucene binding.
import time
from StringIO import StringIO

from PyLucene import *



class TestFormatter(Formatter):
    """Highlighter formatter that wraps scoring text in a red ``<font>`` tag."""

    def __init__(self):
        # Nothing to set up; as in the original, the base initializer is
        # not invoked.
        pass

    def highlightTerm(self, originalText, group):
        """Return ``originalText``, marked up when its token group scored.

        originalText -- text to return, possibly wrapped in markup.
        group        -- token group; only ``getTotalScore()`` is consulted.

        Text whose group score is zero or negative is returned unchanged;
        otherwise it is wrapped in ``<font color="red">...</font>``.
        """
        if group.getTotalScore() <= 0:
            return originalText
        return '<font color="red">' + originalText + '</font>'


class Search:
    """Run queries against the ``index`` directory and build highlighted excerpts."""

    def __init__(self):
        """Open the ``index`` directory and set up the highlighting options."""
        STORE_DIR = "index"
        self.directory = FSDirectory.getDirectory(STORE_DIR, False)
        # NOTE(review): queries are analyzed with ChineseAnalyzer while the
        # indexing script in this file uses StandardAnalyzer -- confirm the
        # two tokenize the indexed text compatibly.
        self.analyzer = ChineseAnalyzer()
        self.maxNumFragmentsRequired = 2
        self.fragmentSeparator = u"..."

    @staticmethod
    def _to_unicode(value, encoding='gbk'):
        """Return ``value`` as text, decoding byte strings with ``encoding``.

        Values that are already text are returned unchanged, so a query the
        caller has decoded is not decoded a second time.
        """
        try:
            return value.decode(encoding)
        except (AttributeError, UnicodeEncodeError):
            # AttributeError: the object has no ``decode`` (already text).
            # UnicodeEncodeError: a Python 2 ``unicode`` object holding
            # non-ASCII characters was implicitly ASCII-encoded by decode().
            return value

    @staticmethod
    def _unused_key(taken, score, step=0.0001):
        """Return ``score``, nudged upward by ``step`` until it is not a key of ``taken``."""
        while score in taken:
            bumped = score + step
            if bumped == score:
                # The step no longer changes the value; stop instead of
                # looping forever.
                break
            score = bumped
        return score

    def search(self, query, start, count=10):
        """Search the ``contents`` field and highlight one page of hits.

        query -- query string; byte strings are decoded as GBK, text is
                 used as given.
        start -- index of the first hit to return.
        count -- maximum number of hits to process (default 10).

        Returns a tuple ``(resultdic, totalnum, usetime)``:
          resultdic -- dict mapping a hit's score to
                       ``[highlighted excerpt, path]``; when two hits share
                       a score the later key is raised in 0.0001 steps
                       until it is free;
          totalnum  -- total number of hits reported for the query;
          usetime   -- seconds from just before the search call until the
                       last excerpt was built.

        Hits whose document cannot be loaded are skipped.
        """
        searcher = IndexSearcher(self.directory)
        try:
            query = self._to_unicode(query)
            query = QueryParser.parse(query, "contents", self.analyzer)
            starttime = time.time()
            hits = searcher.search(query)
            highlighter = Highlighter(TestFormatter(), QueryScorer(query))
            highlighter.setTextFragmenter(SimpleFragmenter(60))
            resultdic = {}
            totalnum = hits.length()
            for index in range(start, min(start + count, totalnum)):
                try:
                    doc = hits.doc(index)
                except Exception:
                    # Best effort: an unreadable hit should not lose the page.
                    continue
                text = doc.get("contents")
                tokenStream = self.analyzer.tokenStream(
                    "contents", StringIO(text))
                result = highlighter.getBestFragments(
                    tokenStream,
                    text,
                    self.maxNumFragmentsRequired,
                    self.fragmentSeparator)
                key = self._unused_key(resultdic, hits.score(index))
                resultdic[key] = [result, doc.get("path")]
            usetime = time.time() - starttime
        finally:
            # Close the searcher even when parsing or highlighting failed.
            searcher.close()
        return resultdic, totalnum, usetime


if __name__ == '__main__':
    tt = Search()
    # Pass the raw input through: Search.search decodes the query itself,
    # and decoding here as well would decode it twice.
    command = raw_input("Query:")
    tt.search(command, 0)
处理中文时要注意编码:上面的代码假定文件内容、文件名和查询字符串都是 GBK 编码,如果你的环境使用其他编码,请相应修改。
版权声明:本文为chuter原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。