+#!/usr/bin/env python
+
+# hari's search engine front-end
+# this program queries the search index created by hari's search index creator application
+
+import cPickle
+import cgi
+import os
+import os.path
+import string
+import sys
+import time
+
+# -----------------------------------------------------------
+# configuration settings
+# -----------------------------------------------------------
+
+# script start time
+strttime = time.time ()
+
+# path to the search database index file - edit as necessary
+search_db_path = "searchwords.idx"
+
+# path to file containing the "stop words" words which should be excluded from the query
+stopwords_list = """$
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+a
+able
+about
+after
+again
+all
+almost
+already
+also
+although
+am
+an
+and
+another
+any
+are
+are
+around
+as
+at
+b
+based
+be
+because
+been
+before
+being
+between
+both
+bring
+but
+by
+c
+came
+can
+com
+come
+comes
+could
+d
+did
+do
+does
+doing
+done
+e
+each
+eight
+else
+etc
+even
+every
+f
+five
+for
+four
+from
+g
+get
+gets
+getting
+go
+going
+got
+h
+had
+has
+have
+he
+he
+her
+here
+him
+himself
+his
+how
+however
+href
+http
+i
+if
+in
+including
+into
+is
+it
+it
+its
+j
+just
+k
+kb
+know
+l
+like
+looks
+m
+mailto
+make
+making
+many
+may
+mb
+me
+means
+might
+more
+more
+most
+move
+mr
+much
+must
+my
+n
+need
+needs
+never
+nice
+nine
+no
+not
+now
+o
+of
+often
+oh
+ok
+on
+on
+one
+only
+or
+org
+other
+our
+out
+over
+own
+p
+piece
+q
+r
+rather
+re
+really
+s
+said
+same
+say
+says
+see
+seven
+several
+she
+should
+since
+single
+six
+so
+so
+some
+something
+still
+stuff
+such
+t
+take
+ten
+than
+that
+the
+their
+them
+them
+then
+there
+there
+these
+they
+they
+thing
+things
+this
+those
+three
+through
+to
+too
+took
+two
+u
+under
+up
+us
+use
+used
+using
+usual
+v
+ve
+very
+via
+w
+want
+was
+way
+we
+we
+well
+were
+what
+when
+where
+whether
+which
+while
+whilst
+who
+why
+will
+with
+within
+would
+x
+y
+yes
+yet
+you
+your
+z""".split ("\n")
+
+# first print the header and the HTML code
+print "Content-type: text/html; charset=utf-8"
+print """
+<html>
+<head>
+ <title>Search Results</title>
+</head>
+<body>
+<h1>Search results</h1>
+<hr>
+"""
+
+# check whether the search database exists at all
+if not os.path.exists (search_db_path) and not os.path.isfile (search_db_path):
+ print "Search database file seems to be missing. Cannot search."
+ print "</body>"
+ print "</html>"
+ sys.exit ()
+
+frm = cgi.FieldStorage ()
+
+# if the form has not been submitted, then ask for it to be submitted
+if not frm.has_key ("fromsearch") or frm.getfirst("query") == "" :
+ print "No search query defined."
+ print "</body>"
+ print "</html>"
+else:
+ # get the query
+ raw_query = frm.getfirst ("query").strip ().lower ()
+ # get the mode of the query - AND or NOT
+ mode = int (frm.getfirst ("criteria"))
+
+ # split the terms
+ raw_query_list = raw_query.split ()
+
+ # get the database of search terms
+ try:
+ search_db = cPickle.load (open (search_db_path, "rb"))
+ except:
+ print "Error reading search database: might not valid."
+
+ # if the number of search words is too much
+ if len (raw_query_list) > 5:
+ print "<p>Too many terms. Try fewer search terms (avoid common words like a, and, or, in).</p>"
+ print "</body>"
+ print "</html>"
+ sys.exit ()
+
+
+ # display the list of search terms
+ print "Search terms: ",
+
+ # search words set
+ search_words = set ()
+
+ # add the search words to the set
+ for raw_word in raw_query_list:
+ # strip the raw word of all non-alphabetic characters at the beginnning and end
+ word = raw_word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t")
+
+ # only if the word is not in the stopwords list add it to the query
+ if not word in stopwords_list:
+ print "<b>" + word + "</b>, ",
+ search_words.add (word)
+ # just display the word with a strike-through to indicate its non-inclusion
+ else:
+ print "<strike>" + word + "</strike>, ",
+
+ # if there are no specific terms found
+ if len(search_words) == 0:
+ print "<p>Search terms are too broad and commonly used words. Try searching with at least one specific term.</p>"
+ print "</body>"
+ print "</html>"
+ sys.exit ()
+
+ # get the search results
+ search_results = set ()
+ flagfirst = True
+
+ # print search mode
+ print "<p>Search mode: ",
+ if (mode == 1):
+ print "documents containing all search terms</p>"
+ else:
+ print "documents containing any of the search terms</p>"
+
+ # get the words found in the database
+ for word in search_words:
+ # if the word is found
+ if search_db.has_key (word):
+ if (flagfirst == True):
+ search_results = search_db[word]
+ flagfirst = False
+ else:
+ # AND mode
+ if (mode == 1):
+ search_results = search_results & search_db[word]
+ # Assume OR mode
+ else:
+ search_results = search_results | search_db[word]
+ # if word is not found and the mode is AND, then return empty result
+ else:
+ if (mode == 1):
+ search_results.clear ()
+ break
+
+ # display the results
+
+ # if no results are found
+ if len (search_results) == 0:
+ print "<p>No pages matched your query.</p>"
+ else:
+ # display the result
+ print "<h2>Documents found</h2>"
+ print "<ul>"
+ for title, url in search_results:
+ print '<li><a href="' + url + '">' + title + '</a>'
+ print "</ul>"
+
+# print the script execution time
+endtime = time.time ()
+dif = endtime - strttime
+
+print "<hr>"
+print "<small>Script execution time: ",
+print dif,
+print "sec</small>"
+print "</body>"
+print "</html>"