Implemented additional files/folders adding
[biaweb_qt.git] / search.py
diff --git a/search.py b/search.py
new file mode 100755 (executable)
index 0000000..cc5ab6e
--- /dev/null
+++ b/search.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python
+
+# hari's search engine front-end
+# this program queries the search index created by hari's search index creator application
+
+import cgi
+import cPickle
+import os
+import os.path
+import string
+import sys
+import time
+
+# -----------------------------------------------------------
+# configuration settings
+# -----------------------------------------------------------
+
+# script start time, used at the end of the script to report the run time
+strttime = time.time ()
+
+# path to the search database index file - edit as necessary
+search_db_path = "searchwords.idx"
+
+# the "stop words": words which are excluded from the query. They are
+# defined inline here (not read from a file) as whitespace-separated text.
+# Despite its name this is a frozenset, so duplicates collapse and each
+# "word in stopwords_list" test is a hash lookup instead of a list scan.
+stopwords_list = frozenset ("""
+$ 0 1 2 3 4 5 6 7 8 9
+a able about after again all almost already also although am an and another
+any are around as at
+b based be because been before being between both bring but by
+c came can com come comes could
+d did do does doing done
+e each eight else etc even every
+f five for four from
+g get gets getting go going got
+h had has have he her here him himself his how however href http
+i if in including into is it its
+j just
+k kb know
+l like looks
+m mailto make making many may mb me means might more most move mr much must my
+n need needs never nice nine no not now
+o of often oh ok on one only or org other our out over own
+p piece
+q
+r rather re really
+s said same say says see seven several she should since single six so some
+something still stuff such
+t take ten than that the their them then there these they thing things this
+those three through to too took two
+u under up us use used using usual
+v ve very via
+w want was way we well were what when where whether which while whilst who
+why will with within would
+x
+y yes yet you your
+z
+""".split ())
+
+# first print the CGI header, then the opening HTML of the results page; the
+# page text begins with an empty line so that a blank line follows the header
+print "Content-type: text/html; charset=utf-8"
+page_top = [
+       "",
+       "<html>",
+       "<head>",
+       "       <title>Search Results</title>",
+       "</head>",
+       "<body>",
+       "<h1>Search results</h1>",
+       "<hr>",
+       "",
+]
+print "\n".join (page_top)
+
+# check whether the search database exists at all. isfile () is False both
+# for a missing path and for a path that is not a regular file (for example
+# a directory), so this single test rejects everything that cannot be opened
+# as an index file.
+if not os.path.isfile (search_db_path):
+       print "Search database file seems to be missing. Cannot search."
+       print "</body>"
+       print "</html>"
+       sys.exit ()
+
+# parse the fields submitted with the request
+frm = cgi.FieldStorage ()
+
+
+# print a final message, close the HTML page and stop the script
+def abort_page (message):
+       print message
+       print "</body>"
+       print "</html>"
+       sys.exit ()
+
+
+# get the query; a missing field yields an empty string instead of None, so
+# the string methods below are always safe to call
+raw_query = frm.getfirst ("query", "").strip ().lower ()
+
+# if the form has not been submitted, then ask for it to be submitted
+if "fromsearch" not in frm or not raw_query:
+       # the closing tags are not printed here: execution falls through to the
+       # end of the script, which closes the page
+       print "No search query defined."
+else:
+       # get the mode of the query - 1 means AND, anything else is treated as
+       # OR; a missing or non-numeric value falls back to OR instead of crashing
+       try:
+               mode = int (frm.getfirst ("criteria"))
+       except (TypeError, ValueError):
+               mode = 0
+
+       # split the terms
+       raw_query_list = raw_query.split ()
+
+       # if the number of search words is too much (checked before the index is
+       # loaded, so an oversized query costs nothing)
+       if len (raw_query_list) > 5:
+               abort_page ("<p>Too many terms. Try fewer search terms (avoid common words like a, and, or, in).</p>")
+
+       # get the database of search terms
+       # NOTE: unpickling can execute code embedded in the file, so the index
+       # must only ever be produced by a trusted indexer
+       try:
+               db_file = open (search_db_path, "rb")
+               try:
+                       search_db = cPickle.load (db_file)
+               finally:
+                       db_file.close ()
+       except Exception:
+               # without an index there is nothing to search, so stop here rather
+               # than continue with an undefined search_db
+               abort_page ("Error reading search database: might not valid.")
+
+       # search words set, and the HTML fragments used to display every term
+       search_words = set ()
+       shown_terms = []
+
+       # add the search words to the set
+       for raw_word in raw_query_list:
+               # strip the raw word of all non-alphabetic characters at the beginning and end
+               word = raw_word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t")
+
+               # a term made up only of punctuation leaves nothing to search for
+               if not word:
+                       continue
+
+               # the term comes from the request, so escape it before echoing it
+               safe_word = cgi.escape (word, True)
+
+               # only if the word is not in the stopwords list add it to the query
+               if word not in stopwords_list:
+                       shown_terms.append ("<b>" + safe_word + "</b>")
+                       search_words.add (word)
+               # just display the word with a strike-through to indicate its non-inclusion
+               else:
+                       shown_terms.append ("<strike>" + safe_word + "</strike>")
+
+       # display the list of search terms
+       print "Search terms: " + ", ".join (shown_terms)
+
+       # if there are no specific terms found
+       if not search_words:
+               abort_page ("<p>Search terms are too broad and commonly used words. Try searching with at least one specific term.</p>")
+
+       # print search mode
+       print "<p>Search mode: ",
+       if mode == 1:
+               print "documents containing all search terms</p>"
+       else:
+               print "documents containing any of the search terms</p>"
+
+       # get the search results
+       search_results = set ()
+       first_match = True
+
+       # get the words found in the database
+       for word in search_words:
+               # if the word is found
+               if word in search_db:
+                       if first_match:
+                               search_results = search_db[word]
+                               first_match = False
+                       # AND mode
+                       elif mode == 1:
+                               search_results = search_results & search_db[word]
+                       # Assume OR mode
+                       else:
+                               search_results = search_results | search_db[word]
+               # if word is not found and the mode is AND, then return empty result
+               elif mode == 1:
+                       # rebind to a new set: search_results may still be the very set
+                       # stored in search_db, which must not be emptied in place
+                       search_results = set ()
+                       break
+
+       # display the results
+
+       # if no results are found
+       if not search_results:
+               print "<p>No pages matched your query.</p>"
+       else:
+               # display the result, sorted so the order is the same on every run
+               # NOTE(review): title and url are written exactly as stored in the
+               # index; presumably the indexer stores HTML-safe text - confirm
+               print "<h2>Documents found</h2>"
+               print "<ul>"
+               for title, url in sorted (search_results):
+                       print '<li><a href="' + url + '">' + title + '</a>'
+               print "</ul>"
+
+# report how long the script ran (measured from strttime), then close the page
+elapsed = time.time () - strttime
+
+print "<hr>"
+print "<small>Script execution time: ", elapsed, "sec</small>"
+print "</body>"
+print "</html>"