#!/usr/bin/env python

# hari's search engine front-end
# this program queries the search index created by hari's search index creator application
#
# NOTE(review): the copy of this script that was reviewed was a flattened diff
# view in which the HTML tags inside the string literals had been stripped and
# the loop that lists the matching documents was missing.  The markup emitted
# below (<h1>, <h2>, <p>, <b>, <s>, <ol>/<li>) is a minimal reconstruction --
# confirm it against the repository version before deploying.

import string
import os
import os.path
import sys
import time

try:
    import cPickle
except ImportError:
    # Python 3 has no cPickle module; plain pickle is its replacement there.
    import pickle as cPickle

# -----------------------------------------------------------
# configuration settings
# -----------------------------------------------------------

# script start time
strttime = time.time()

# path to the search database index file - edit as necessary
search_db_path = "searchwords.idx"

# the "stop words": words which are excluded from the query
stopwords_list = """
$ 0 1 2 3 4 5 6 7 8 9
a able about after again all almost already also although am an and
another any are around as at
b based be because been before being between both bring but by
c came can com come comes could
d did do does doing done
e each eight else etc even every
f five for four from
g get gets getting go going got
h had has have he her here him himself his how however href http
i if in including into is it its
j just
k kb know
l like looks
m mailto make making many may mb me means might more most move mr much
must my
n need needs never nice nine no not now
o of often oh ok on one only or org other our out over own
p piece
q
r rather re really
s said same say says see seven several she should since single six so
some something still stuff such
t take ten than that the their them then there these they thing things
this those three through to too took two
u under up us use used using usual
v ve very via
w want was way we well were what when where whether which while whilst
who why will with within would
x y yes yet you your z
""".split()

# set view of the stop words so that each membership test is O(1)
stopwords = frozenset(stopwords_list)

# characters stripped from both ends of every raw query term
STRIP_CHARS = ":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t"

# a query with more raw terms than this is rejected
MAX_TERMS = 5

# value of the "criteria" form field that selects AND mode; anything else is OR
MODE_ALL = 1

# NOTE(review): one further line of markup followed the heading in the
# original header and could not be recovered -- TODO confirm.
PAGE_HEAD = """<html>
<head>
<title>Search Results</title>
</head>
<body>
<h1>Search results</h1>"""


def emit(text):
    """Write one line of the HTTP response to standard output."""
    sys.stdout.write(text + "\n")


def escape_html(text):
    """Return text with the HTML-significant characters turned into entities.

    Query words come straight from the request, so they must never be echoed
    into the page unescaped.
    """
    text = "%s" % (text,)
    # '&' has to be replaced first, otherwise the entities get double-escaped
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;").replace(">", "&gt;")
    return text.replace('"', "&quot;").replace("'", "&#39;")


def classify_terms(raw_terms):
    """Clean the raw query terms and mark which of them are searchable.

    Each term is stripped of the STRIP_CHARS punctuation at the beginning
    and end.  Terms that become empty (pure punctuation) are dropped.

    Returns a list of (word, searchable) tuples in query order; searchable
    is False for stop words.
    """
    classified = []
    for raw_word in raw_terms:
        word = raw_word.strip(STRIP_CHARS)
        if not word:
            continue
        classified.append((word, word not in stopwords))
    return classified


def find_documents(search_db, search_words, mode):
    """Return the set of documents matching search_words.

    search_db maps a word to a collection of documents.  With
    mode == MODE_ALL a document must be listed under every word (AND);
    with any other mode it must be listed under at least one word (OR).
    In AND mode a word that is absent from the index makes the result empty.

    The collections stored in search_db are never modified.
    """
    results = None
    for word in search_words:
        if word in search_db:
            if results is None:
                # copy, so that later updates do not touch the index itself
                results = set(search_db[word])
            elif mode == MODE_ALL:
                results.intersection_update(search_db[word])
            else:
                results.update(search_db[word])
        elif mode == MODE_ALL:
            return set()
    if results is None:
        return set()
    return results


def load_index(path):
    """Unpickle and return the search index stored at path.

    SECURITY: unpickling can execute arbitrary code, so the index file must
    only ever be written by the trusted index creator.
    """
    index_file = open(path, "rb")
    try:
        return cPickle.load(index_file)
    finally:
        index_file.close()


def format_result(doc):
    """Return the <li> element for one matching document.

    NOTE(review): the shape of an index entry is not visible in the reviewed
    copy.  This assumes either a plain URL string or a (url, title, ...)
    tuple -- TODO confirm against the index creator.
    """
    if isinstance(doc, tuple) and len(doc) >= 2:
        url, title = doc[0], doc[1]
    else:
        url = title = doc
    return '<li><a href="%s">%s</a></li>' % (escape_html(url),
                                            escape_html(title))


def run_search():
    """Handle one request and emit the body of the result page.

    Returns as soon as the outcome (an error message or the result list) has
    been written; the caller closes the page.
    """
    # Imported here because the cgi module was removed from the standard
    # library in Python 3.13; the helpers above stay importable without it.
    import cgi

    # check whether the search database exists at all
    if not os.path.isfile(search_db_path):
        emit("Search database file seems to be missing. Cannot search.")
        return

    frm = cgi.FieldStorage()

    # FieldStorage drops blank fields, so an empty query arrives as None
    raw_query = (frm.getfirst("query") or "").strip().lower()

    # if the form has not been submitted, then ask for it to be submitted
    if "fromsearch" not in frm or not raw_query:
        emit("No search query defined.")
        return

    # get the mode of the query - AND or OR; fall back to OR when the field
    # is missing or not a number
    try:
        mode = int(frm.getfirst("criteria", ""))
    except (TypeError, ValueError):
        mode = 0

    # split the terms
    raw_query_list = raw_query.split()

    # if the number of search words is too much
    if len(raw_query_list) > MAX_TERMS:
        emit("<p>Too many terms. Try fewer search terms "
             "(avoid common words like a, and, or, in).</p>")
        return

    # display the list of search terms: searched words in bold, stop words
    # with a strike-through to indicate their non-inclusion
    terms = classify_terms(raw_query_list)
    shown = []
    for word, searchable in terms:
        if searchable:
            shown.append("<b>%s</b>" % escape_html(word))
        else:
            shown.append("<s>%s</s>" % escape_html(word))
    emit("Search terms: " + ", ".join(shown))

    # if there are no specific terms found
    search_words = set([word for word, searchable in terms if searchable])
    if not search_words:
        emit("<p>Search terms are too broad and commonly used words. "
             "Try searching with at least one specific term.</p>")
        return

    # get the database of search terms; unpickling a damaged file can raise
    # almost any exception, hence the broad handler at this boundary
    try:
        search_db = load_index(search_db_path)
    except Exception:
        emit("Error reading search database: might not be valid.")
        return

    # print search mode
    if mode == MODE_ALL:
        emit("<p>Search mode: documents containing all search terms</p>")
    else:
        emit("<p>Search mode: documents containing any of the search terms</p>")

    search_results = find_documents(search_db, search_words, mode)

    # display the results
    if not search_results:
        emit("<p>No pages matched your query.</p>")
        return

    emit("<h2>Documents found</h2>")
    emit("<ol>")
    for doc in search_results:
        emit(format_result(doc))
    emit("</ol>")


def main():
    """Emit the complete CGI response: headers, page, timing and footer."""
    # first print the header and the HTML code; the empty line ends the
    # HTTP headers
    emit("Content-type: text/html; charset=utf-8")
    emit("")
    emit(PAGE_HEAD)

    run_search()

    # print the script execution time and close the page exactly once,
    # whatever the outcome of the search was
    dif = time.time() - strttime
    emit("<p>Script execution time: %s sec</p>" % dif)
    emit("</body>")
    emit("</html>")


if __name__ == "__main__":
    main()