# biaweb_exporter.py (from biaweb_qt.git; last commit message: "Updated htaccess to Apache 2.4 compat")
1 # BiaWeb Website content manager (c) 2010 V.Harishankar
2 # Site exporter/generator class
3
4 import os
5 import os.path
6 import sys
7 import time
8 import sqlite3
9 import string
10 import shutil
11 import HTMLParser
12 import cPickle
13 import biaweb_db
14 import biaweb_strings
15
# class to remove HTML tags from a string - used by the search
# generator to get an article without HTML tags
class HTMLTagRemover (HTMLParser.HTMLParser):
    """HTML parser that keeps only the text between tags.

    Feed it markup with feed (), then call get_raw_string () for the
    concatenated character data with every tag discarded."""

    def __init__ (self):
        HTMLParser.HTMLParser.__init__ (self)
        # text fragments collected so far; joined on demand by
        # get_raw_string ()
        self.data_list = []

    def handle_data (self, data):
        # called by the base parser for every run of text outside a tag
        self.data_list.append (data)

    def get_raw_string (self):
        # everything collected so far, as one string
        return "".join (self.data_list)
34
# to format the best rated articles in a HTML link list
def html_format_best_rated (best_rated):
    """Render the best rated articles as an unordered HTML list.

    Each entry links to <category stub>/<article stub>.html (fields 13
    and 8 of an article row) with the article title (field 1) as the
    link text."""
    links = ['<li><a href="' + art[13] + '/' + art[8] + '.html">' + art[1] + '</a></li>\n'
             for art in best_rated]
    return "<ul>\n" + "".join (links) + "</ul>\n"
46
# to format categories in a HTML link list
def html_format_categories (cats):
    """Render the categories as an unordered HTML list.

    Each entry links to <category stub>/ (field 3 of a category row)
    with the category name (field 1) as the link text."""
    links = ['<li><a href="' + cat[3] + '/">' + cat[1] + '</a></li>\n'
             for cat in cats]
    return "<ul>\n" + "".join (links) + "</ul>\n"
57
# to convert a rating number into rating images out of max_stars stars
def html_format_rating (rating, max_stars = 10):
    """Render a numeric rating as a row of star images.

    rating     -- number of filled stars; -1 means "unrated"
    max_stars  -- total slots in the row (default 10, the site's scale;
                  parameterized so other scales can reuse this helper)

    Returns the HTML string of <img> tags, or the text "unrated"."""
    # if -1 then return unrated as the text
    if rating == -1:
        return "unrated"
    # string multiplication replaces the two counting loops; a negative
    # multiplier yields "" exactly as an empty range () did
    filled = '<img src="star.gif" alt="*" />' * rating
    grey = '<img src="star-grey.gif" alt="-" />' * (max_stars - rating)
    return filled + grey
73
# function to generate the search index file
def generate_search_index (dbname, conf, full_text_index = True):
    """Build the CGI search index for the site.

    Strips the HTML from every article, maps each non-stop-word to a set
    of (title, url) tuples and pickles that dictionary (protocol 2) to
    <destination>/cgi-bin/searchwords.idx.  Also writes a .htaccess there
    so the index cannot be fetched from a browser.

    dbname          -- path of the site database
    conf            -- configuration row (conf[0] site url, conf[5] dest dir)
    full_text_index -- index article bodies when True, keywords when False

    Returns True on success, False on any failure."""
    # get all the articles
    arts = biaweb_db.site_articles (dbname)
    # if cannot get articles
    if arts is False:
        return False

    # if full text index, then field should be article content otherwise keywords
    if full_text_index:
        searchfield = 4
    else:
        searchfield = 3

    # initialize the search index dictionary
    search_index = dict ()

    # now run through the articles and generate a table of unique words (except
    # stop words)
    for art in arts:
        # now strip out the HTML tags from the articles
        parser = HTMLTagRemover ()
        parser.feed (art[searchfield])
        parser.close ()
        # get the word list
        word_list = parser.get_raw_string ().split ()

        # the search entry title is the article title and the url is
        # http://siteurl/Category/Article.html - both are the same for every
        # word of this article, so compute them once outside the word loop
        title = art[1]
        url = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"

        # now run through each word, make it lowercase, remove all cruft from it
        # and add it to a dictionary
        for word in word_list:
            cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
            # if the word is not a "stop word", then add it to the search database
            if cleanword not in biaweb_strings.stopwords:
                # setdefault creates the set on first sight of the word;
                # dict.has_key is deprecated, this is the idiomatic spelling
                search_index.setdefault (cleanword, set ()).add ((title, url))

    # done now write the search database as a python pickle object of search_index
    search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx")
    htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess")
    try:
        # dump the dictionary as a pickle object in binary mode; try/finally
        # guarantees the handle is closed even when a write raises
        fsearchindex = open (search_index_path, "wb")
        try:
            cPickle.dump (search_index, fsearchindex, 2)
        finally:
            fsearchindex.close ()
        # write the htaccess file to prevent opening the index file from web browser
        fhtaccess = open (htaccess_path, "w+")
        try:
            fhtaccess.write (biaweb_strings.searchindex_htaccess)
        finally:
            fhtaccess.close ()
    except (OSError, IOError):
        return False

    # finally return true
    return True
140
# function to copy additional files and folders to the destination path
def copy_files_folders (conf, files_to_copy, folders_to_copy):
    """Copy the search script, the rating images and any user supplied
    files/folders into the destination directory conf[5].

    files_to_copy / folders_to_copy -- sequences of (source path,
    destination path relative to conf[5]) tuples.

    Returns True on success, False on the first copy/create error."""
    # always create the cgi-bin directory: generate_search_index () writes
    # the search index into it even when search.py is missing here and has
    # to be copied manually by the user (the old code only made the
    # directory when search.py was found)
    cgibin_path = os.path.join (conf[5], "cgi-bin")
    try:
        os.mkdir (cgibin_path)
    except (IOError, OSError):
        return False

    # try to copy search.py into the destination directory if possible
    # otherwise user must copy it manually
    search_script_path = os.path.join (sys.path[0], "search.py")
    if os.path.exists (search_script_path):
        try:
            shutil.copy2 (search_script_path, cgibin_path)
        except (IOError, OSError):
            return False

    # try to copy the star rating images to destination directory if possible
    # otherwise user must copy them manually (one loop instead of two
    # copy/except blocks)
    for img in ("star.gif", "star-grey.gif"):
        img_path = os.path.join (sys.path[0], img)
        if os.path.exists (img_path):
            try:
                shutil.copy2 (img_path, conf[5])
            except (IOError, OSError):
                return False

    # additional files to copy; the old "<> []" guards are gone - the "<>"
    # operator was removed in Python 3 and an empty sequence simply skips
    # the loop anyway
    for src, dest in files_to_copy:
        # get full path from relative path in dest
        full_dest = os.path.join (conf[5], dest)
        try:
            shutil.copy2 (src, full_dest)
        except (IOError, OSError):
            return False

    # additional folders to copy
    for src, dest in folders_to_copy:
        # get full path from relative path in dest
        full_dest = os.path.join (conf[5], dest)
        try:
            shutil.copytree (src, full_dest)
        except (IOError, OSError):
            return False

    # finally return true
    return True
195
# function to generate article pages
def generate_article_pages (dbname, conf, templates, category_str, bestrated_str):
    """Generate one HTML page per article under <destination>/<Category>/.

    templates[0][1] is the main page template, templates[1][1] the
    per-article bit; category_str / bestrated_str are pre-rendered
    sidebar lists.  Returns True on success, False on failure."""
    # main template
    tpl_main = string.Template (templates[0][1])
    # article template
    tpl_articlebit = string.Template (templates[1][1])

    # get all articles from the database
    articles = biaweb_db.site_articles (dbname)
    if articles is False:
        # bug fix: this used to be a bare "return" (None), which the
        # caller's "if ret is False" test silently missed
        return False

    # walk through each article and generate the file in the appropriate category
    # folder
    for art in articles:
        art_cdate = time.ctime (art[5])
        art_mdate = time.ctime (art[6])
        rating_str = html_format_rating (art[9])
        # now build the article from the article bit template
        article_str = tpl_articlebit.safe_substitute (article_title = art[1],
                        article_cdate = art_cdate,
                        article_mdate = art_mdate,
                        rating = rating_str,
                        article_contents = art[4])

        # now build the article page
        articlepage_str = tpl_main.safe_substitute (site_title = art[1],
                        site_url = "http://" + conf[0],
                        meta_keywords = art[3],
                        meta_description = art[2],
                        page_title = conf[1],
                        page_desc = conf[3],
                        contents_bit = article_str,
                        list_of_categories = category_str,
                        list_best_rated = bestrated_str,
                        copyright = conf[6])
        # write to the article file; try/finally closes the handle that the
        # original code leaked
        try:
            farticle = open (os.path.join (conf[5], art[13], art[8] + ".html"), "w+")
            try:
                farticle.write (articlepage_str)
            finally:
                farticle.close ()
        except (OSError, IOError):
            return False

    # finally return true
    return True
241
# function to generate category directories and indices
def generate_category_indices (dbname, conf, templates, category_str, bestrated_str, category_list):
    """Create a directory plus an index.html for every category.

    The index page lists the category's articles in a table (url, title,
    creation date, rating stars).  Returns True on success, False on the
    first failure."""
    page_tpl = string.Template (templates[0][1])    # main page template
    table_tpl = string.Template (templates[3][1])   # article table bit
    row_tpl = string.Template (templates[4][1])     # single table row bit

    for cat in category_list:
        # create the directory named after the category stub
        try:
            os.mkdir (os.path.join (conf[5], cat[3]))
        except (IOError, OSError):
            return False

        # fetch this category's articles
        arts = biaweb_db.site_articles (dbname, cat[0])
        if arts is False:
            return False

        # one table row per article in the category
        rows = [row_tpl.safe_substitute (article_url = art[13] + "/" + art[8] + ".html",
                                         title = art[1],
                                         created = time.ctime (art[5]),
                                         rating = html_format_rating (art[9]))
                for art in arts]

        # wrap the rows in the table bit ...
        table_html = table_tpl.safe_substitute (category_title = cat[1],
                                                category_desc = cat[2],
                                                table_rows = "".join (rows))

        # ... and the table in the full page template
        page_html = page_tpl.safe_substitute (site_title = conf[1] + " - " + cat[1],
                                              site_url = "http://" + conf[0],
                                              meta_keywords = conf[2],
                                              meta_description = cat[2],
                                              page_title = conf[1],
                                              page_desc = conf[3],
                                              contents_bit = table_html,
                                              list_of_categories = category_str,
                                              list_best_rated = bestrated_str,
                                              copyright = conf[6])

        # write Category/index.html
        try:
            findex = open (os.path.join (conf[5], cat[3], "index.html"), "w+")
            findex.write (page_html)
            findex.close ()
        except (OSError, IOError):
            return False

    return True
306
# function to generate the RSS feed for the website
def generate_rss_feed (dbname, conf):
    """Write the RSS feed to <destination>/subscribe.xml.

    Uses the latest articles (capped at conf[4] entries) as the feed
    items.  Returns True on success, False on failure."""
    # rss main template
    tpl_rss = string.Template (biaweb_strings.template_rss)
    # rss item bit template
    tpl_rss_itembit = string.Template (biaweb_strings.template_rss_item)

    # get the latest articles (limit by number of rss items)
    arts = biaweb_db.site_latest_articles (dbname, conf[4])
    if arts is False:
        return False

    rss_item_list = []
    # run through the articles and generate the rss items
    for art in arts:
        # link is http://<site>/<category stub>/<article stub>.html
        itemlink = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"
        item_str = tpl_rss_itembit.safe_substitute (item_title = art[1],
                        item_link = itemlink,
                        description = art[2])
        rss_item_list.append (item_str)

    # now get the rss items as a string
    rss_item_str = "".join (rss_item_list)

    # now generate the feed
    rss_str = tpl_rss.safe_substitute (title = conf[1],
                    link = "http://" + conf[0],
                    description = conf[3],
                    rss_items = rss_item_str)

    # now try to write it to the rss file; try/finally closes the handle
    # that the original code leaked
    try:
        frss = open (os.path.join (conf[5], "subscribe.xml"), "w+")
        try:
            frss.write (rss_str)
        finally:
            frss.close ()
    except (IOError, OSError):
        return False

    # finally return true
    return True
347
# function to generate main index file and stylesheet
def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
    """Write index.html and style.css into the destination directory.

    The home page shows the latest articles as news items; templates[5][1]
    is the raw stylesheet text.  Returns True on success, False on
    failure."""
    page_tpl = string.Template (templates[0][1])    # main page template
    index_tpl = string.Template (templates[6][1])   # index contents bit
    news_tpl = string.Template (templates[2][1])    # single news item bit

    # the latest articles double as the news items; conf[4] (number of RSS
    # entries) also caps how many appear on the front page
    recent = biaweb_db.site_latest_articles (dbname, conf[4])
    if recent is False:
        return False

    # one news bit per recent article; the link is Category/Article.html,
    # art[5] the creation time, art[1]/art[2] title and summary
    news_html = "".join (
        news_tpl.safe_substitute (news_title = art[1],
                                  news_link = art[13] + "/" + art[8] + ".html",
                                  news_datetime = time.ctime (art[5]),
                                  news_description = art[2])
        for art in recent)

    # drop the news into the index bit ...
    index_html = index_tpl.safe_substitute (site_name = conf[1],
                                            news_updates = news_html)
    # ... and the index bit into the main page template
    page_html = page_tpl.safe_substitute (site_title = conf[1],
                                          site_url = "http://" + conf[0],
                                          meta_keywords = conf[2],
                                          meta_description = conf[3],
                                          page_title = conf[1],
                                          page_desc = conf[3],
                                          contents_bit = index_html,
                                          list_of_categories = category_str,
                                          list_best_rated = bestrated_str,
                                          copyright = conf[6])

    # write the index.html file in the destination directory
    try:
        fout = open (os.path.join (conf[5], "index.html"), "w+")
        fout.write (page_html)
        fout.close ()
    except (IOError, OSError):
        return False

    # write the style.css file in the destination directory
    try:
        fout = open (os.path.join (conf[5], "style.css"), "w+")
        fout.write (templates[5][1])
        fout.close ()
    except (IOError, OSError):
        return False

    return True
414
# superfunction to generate the site
def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True):
    """Drive the whole export: load config/templates/categories, wipe and
    recreate the destination tree, then run every generation step in
    order.  Returns True when everything succeeded, False on the first
    failure."""
    # the configuration record
    conf = biaweb_db.get_configuration (dbname)
    if conf is False:
        return False

    # the page templates
    tpls = biaweb_db.get_templates (dbname)
    if tpls is False:
        return False

    # the categories, rendered once as a HTML bulleted list for the sidebar
    cats = biaweb_db.get_categories (dbname)
    if cats is False:
        return False
    cats_str = html_format_categories (cats)

    # the best rated articles, likewise rendered once
    best_rated = biaweb_db.site_get_bestrated (dbname)
    if best_rated is False:
        return False
    best_rated_str = html_format_best_rated (best_rated)

    # wipe the destination tree and start from an empty directory
    try:
        if os.path.exists (conf[5]):
            shutil.rmtree (conf[5])
        os.mkdir (conf[5])
    except OSError:
        return False

    # the generation steps, in order: home page + stylesheet, RSS feed,
    # category indices, article pages, extra files/folders, search index
    steps = (
        lambda: generate_home_page (dbname, conf, tpls, cats_str, best_rated_str),
        lambda: generate_rss_feed (dbname, conf),
        lambda: generate_category_indices (dbname, conf, tpls, cats_str, best_rated_str, cats),
        lambda: generate_article_pages (dbname, conf, tpls, cats_str, best_rated_str),
        lambda: copy_files_folders (conf, files_to_copy, folders_to_copy),
        lambda: generate_search_index (dbname, conf, search_type_full),
    )
    # stop at the first step that reports failure
    for step in steps:
        if step () is False:
            return False

    # finally when all is successfully done return true
    return True