Updated htaccess to Apache 2.4 compat
[biaweb_qt.git] / search.py
1 #!/usr/bin/env python
2
3 # hari's search engine front-end
4 # this program queries the search index created by hari's search index creator application
5
import cgi
import cPickle
import os
import os.path
import string
import sys
import time
12
# -----------------------------------------------------------
# configuration settings
# -----------------------------------------------------------

# script start time; used at the end of the page to report how long the
# request took
strttime = time.time()

# path to the search database index file - edit as necessary
search_db_path = "searchwords.idx"

# "stop words": very common words which are excluded from the query.
# The words are listed inline (there is no separate stop-word file).  They are
# kept in a frozenset because the script only ever tests membership
# (``word in stopwords_list``); the name is retained for compatibility.
stopwords_list = frozenset("""
    $ 0 1 2 3 4 5 6 7 8 9
    a able about after again all almost already also although am an and
    another any are around as at
    b based be because been before being between both bring but by
    c came can com come comes could
    d did do does doing done
    e each eight else etc even every
    f five for four from
    g get gets getting go going got
    h had has have he her here him himself his how however href http
    i if in including into is it its
    j just
    k kb know
    l like looks
    m mailto make making many may mb me means might more most move mr much
    must my
    n need needs never nice nine no not now
    o of often oh ok on one only or org other our out over own
    p piece
    q
    r rather re really
    s said same say says see seven several she should since single six so
    some something still stuff such
    t take ten than that the their them then there these they thing things
    this those three through to too took two
    u under up us use used using usual
    v ve very via
    w want was way we well were what when where whether which while whilst
    who why will with within would
    x y yes yet you your z
""".split())
266
267 # first print the header and the HTML code
268 print "Content-type: text/html; charset=utf-8"
269 print """
270 <html>
271 <head>
272 <title>Search Results</title>
273 </head>
274 <body>
275 <h1>Search results</h1>
276 <hr>
277 """
278
279 # check whether the search database exists at all
280 if not os.path.exists (search_db_path) and not os.path.isfile (search_db_path):
281 print "Search database file seems to be missing. Cannot search."
282 print "</body>"
283 print "</html>"
284 sys.exit ()
285
286 frm = cgi.FieldStorage ()
287
288 # if the form has not been submitted, then ask for it to be submitted
289 if not frm.has_key ("fromsearch") or frm.getfirst("query") == "" :
290 print "No search query defined."
291 print "</body>"
292 print "</html>"
293 else:
294 # get the query
295 raw_query = frm.getfirst ("query").strip ().lower ()
296 # get the mode of the query - AND or NOT
297 mode = int (frm.getfirst ("criteria"))
298
299 # split the terms
300 raw_query_list = raw_query.split ()
301
302 # get the database of search terms
303 try:
304 search_db = cPickle.load (open (search_db_path, "rb"))
305 except:
306 print "Error reading search database: might not valid."
307
308 # if the number of search words is too much
309 if len (raw_query_list) > 5:
310 print "<p>Too many terms. Try fewer search terms (avoid common words like a, and, or, in).</p>"
311 print "</body>"
312 print "</html>"
313 sys.exit ()
314
315
316 # display the list of search terms
317 print "Search terms: ",
318
319 # search words set
320 search_words = set ()
321
322 # add the search words to the set
323 for raw_word in raw_query_list:
324 # strip the raw word of all non-alphabetic characters at the beginnning and end
325 word = raw_word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t")
326
327 # only if the word is not in the stopwords list add it to the query
328 if not word in stopwords_list:
329 print "<b>" + word + "</b>, ",
330 search_words.add (word)
331 # just display the word with a strike-through to indicate its non-inclusion
332 else:
333 print "<strike>" + word + "</strike>, ",
334
335 # if there are no specific terms found
336 if len(search_words) == 0:
337 print "<p>Search terms are too broad and commonly used words. Try searching with at least one specific term.</p>"
338 print "</body>"
339 print "</html>"
340 sys.exit ()
341
342 # get the search results
343 search_results = set ()
344 flagfirst = True
345
346 # print search mode
347 print "<p>Search mode: ",
348 if (mode == 1):
349 print "documents containing all search terms</p>"
350 else:
351 print "documents containing any of the search terms</p>"
352
353 # get the words found in the database
354 for word in search_words:
355 # if the word is found
356 if search_db.has_key (word):
357 if (flagfirst == True):
358 search_results = search_db[word]
359 flagfirst = False
360 else:
361 # AND mode
362 if (mode == 1):
363 search_results = search_results & search_db[word]
364 # Assume OR mode
365 else:
366 search_results = search_results | search_db[word]
367 # if word is not found and the mode is AND, then return empty result
368 else:
369 if (mode == 1):
370 search_results.clear ()
371 break
372
373 # display the results
374
375 # if no results are found
376 if len (search_results) == 0:
377 print "<p>No pages matched your query.</p>"
378 else:
379 # display the result
380 print "<h2>Documents found</h2>"
381 print "<ul>"
382 for title, url in search_results:
383 print '<li><a href="' + url + '">' + title + '</a>'
384 print "</ul>"
385
386 # print the script execution time
387 endtime = time.time ()
388 dif = endtime - strttime
389
390 print "<hr>"
391 print "<small>Script execution time: ",
392 print dif,
393 print "sec</small>"
394 print "</body>"
395 print "</html>"