前一段时间做东西用到了pylucene,包括建立索引,检索,高亮显示等等。贴两段代码,希望对大家有用。
pylucene的安装就不多说了,我用的版本是PyLucene-1.9.1。
建立索引:

#!/usr/bin/env python
# Index-building script. The accompanying text states it was written against
# PyLucene 1.9.1; the code uses Python 2 syntax.

import os

import PyLucene



class
IndexFiles:

“””




“””


def
__init__
(self, root, storeDir, analyzer):

if
not
os.path.exists(storeDir):


=
PyLucene.StandardAnalyzer()

=
PyLucene.FSDirectory.getDirectory(storeDir , True)

=
PyLucene.IndexWriter(store, analyzer, True)


‘
optimizing index
‘
,



‘
done
‘


def
indexDocs(self, root, writer):

for
root, dirnames, filenames
in
os.walk(root):

for
filename
in
filenames:

if
not
filename.endswith(
‘
.txt
‘
):

continue

“
adding
“
, filename

try
:

=
os.path.join(root, filename)

=
open(path)

=
unicode(file.read(),
‘
gbk
‘
)


=
PyLucene.Document()

“
name
“
, filename.decode(
‘
gbk
‘
)))

“
path
“
, path.decode(
‘
gbk
‘
)))

if
len(contents)
>
0:

pass

“
contents
“
, contents))

else
:

“
warning: no content in %s
“
%
filename


except
Exception, e:

“
Failed in indexDocs:
“
, e


def
indexmain(path):

try
:

“
index
“
, PyLucene.StandardAnalyzer())

return
”

except
Exception, e:

return
e


if
__name__
==
‘
__main__
‘
:

=
raw_input(
“
path:
“
)


检索,高亮显示:

#!/usr/bin/env python
# Search-and-highlight script (Python 2 syntax, PyLucene).

import time
from StringIO import StringIO

from PyLucene import *



class TestFormatter(Formatter):
    """Highlighter formatter that wraps scoring terms in a red <font> tag."""

    def __init__(self):
        pass

    def highlightTerm(self, originalText, group):
        """Return *originalText*, wrapped in red-font markup if it scored.

        originalText -- text of the token being formatted
        group        -- token group; text is returned unchanged when its
                        getTotalScore() is not positive
        """
        if group.getTotalScore() <= 0:
            return originalText
        # Single-quoted literal so the inner double quotes need no escaping
        # (the listing had an unescaped quote inside a double-quoted string).
        return '<font color="red">' + originalText + '</font>'


class Search:
    """Query the "index" directory and return highlighted result snippets.

    NOTE(review): this class was reconstructed from a listing in which the
    extraction dropped assignment targets and some statements. Names that
    are used later in the listing (searcher, hits, doc, text, score, ...)
    are grounded; the remaining inferred pieces are marked ``NOTE(review)``.
    """

    def __init__(self):
        STORE_DIR = "index"
        self.directory = FSDirectory.getDirectory(STORE_DIR, False)
        # NOTE(review): the indexing script builds the index with
        # StandardAnalyzer while this side parses queries with
        # ChineseAnalyzer -- confirm the two are meant to differ.
        self.analyzer = ChineseAnalyzer()
        # NOTE(review): the attribute names of the next two values were
        # lost; they are presumed to be the fragment count and separator
        # passed to Highlighter.getBestFragments().
        self.maxNumFragmentsRequired = 2
        self.fragmentSeparator = u"…"

    def search(self, query, start):
        """Run *query* on the "contents" field and highlight up to 10 hits.

        query -- query text; a GBK-encoded byte string is decoded, a
                 unicode string is used as is
        start -- zero-based rank of the first hit to return

        Returns a tuple (resultdic, totalnum, usetime):
        resultdic -- dict mapping hit score -> [highlighted text, "path"]
        totalnum  -- total number of hits for the query
        usetime   -- seconds spent searching and highlighting
        """
        # Calling .decode() on a unicode object makes Python 2 encode it to
        # ASCII first, which fails for Chinese text, so only decode bytes.
        if isinstance(query, str):
            query = query.decode('gbk')

        searcher = IndexSearcher(self.directory)
        try:
            query = QueryParser.parse(query, "contents", self.analyzer)
            starttime = time.time()
            hits = searcher.search(query)

            formatter = TestFormatter()
            highlighter = Highlighter(formatter, QueryScorer(query))
            # NOTE(review): only "60))" survived of this statement; a
            # 60-character SimpleFragmenter is the presumed original.
            highlighter.setTextFragmenter(SimpleFragmenter(60))

            resultdic = {}
            totalnum = hits.length()
            for i in range(10):
                index = start + i
                if index >= totalnum:
                    break
                try:
                    doc = hits.doc(index)
                except Exception:
                    # Best effort: skip a hit whose document cannot be loaded.
                    continue
                text = doc.get("contents")
                tokenStream = self.analyzer.tokenStream("contents",
                                                        StringIO(text))
                # NOTE(review): the argument list was lost in the listing;
                # these are the presumed arguments.
                result = highlighter.getBestFragments(
                    tokenStream,
                    text,
                    self.maxNumFragmentsRequired,
                    self.fragmentSeparator)

                score = hits.score(index)
                # Scores are the dict keys, so nudge a colliding score until
                # it is unique; a single nudge could still hit an existing
                # key and overwrite that entry.
                while score in resultdic:
                    score += 0.0001
                resultdic[score] = [result, doc.get("path")]

            stoptime = time.time()
            usetime = stoptime - starttime
            return resultdic, totalnum, usetime
        finally:
            # Release the index reader held by the searcher on every path.
            searcher.close()



if
__name__
==
‘
__main__
‘
:

=
Search()

=
raw_input(
“
Query:
“
).decode(
‘
gbk
‘
)


处理中文时要注意编码:上面的代码假定文件内容,文件名和查询串都是GBK编码,需要先解码成unicode再交给Lucene。
版权声明:本文为chuter原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。