#!./exu

-- How to use EUforum Search
-- 
-- Searches are not case-sensitive. 
--
-- Matching messages are displayed in order of score. Logical connectives 
-- such as "and" and "or" are not supported, but the scoring system always 
-- favors messages that contain many different search words, over messages 
-- that contain the same search word many times. For example, if you search
-- for:
--      object oriented
--
-- then messages that contain both "object" and "oriented" (an exact match)
-- will always rank higher than messages that contain only one of these words,
-- even where there are many occurrences of that one word.
--
-- The results will indicate the number of exact matches, versus the
-- number of partial matches, and the exact matches will be listed first.
--
-- To search for a phrase, put double-quotes around the words in the phrase. 
-- e.g  "Euphoria programming language"
--
-- Specifying "Posted by" will limit the search to messages where the
-- "From:" line contains the string that you provide. If no keywords are
-- specified, the messages by that poster will be sorted most recent first.
--
-- If you specify neither "Posted By" nor "Keywords", you'll get a
-- chronological listing of messages in the chosen date range, most
-- recent message first.
--
-- To extract all the messages for a given thread, click on the Subject
-- line of any message in that thread. The complete thread will be 
-- displayed in chronological order.
--
-- Recent messages in the search results (past week or so) can also be viewed
-- on the EUforum message board. Just click the link 
-- "view this message in EUforum". This will also make it easier to 
-- reply to the message.
-- 
-- How it works internally:
--
-- Several years of EUforum messages (well over 100 Mb) are stored
-- in monthly files on OpenEuphoria.org and are updated immediately 
-- when a new message is posted.
-- 
-- The search is speeded up by having a master index file of all messages. 
-- It contains the poster's name and a special "signature" that records 
-- all the 2-letter combinations that exist in that message. The signature 
-- consists of 676 bits (26x26). When searching for a keyword, only the 
-- messages that have all the necessary 2-letter combinations 
-- contained in that keyword are searched.
--
-- The search is also speeded up by keeping a cache of the most recent
-- words that have been searched for, with all of the scoring information
-- for those words. This is particularly helpful when the user moves
-- to the next page of results, and also when he performs a modified
-- search using some of the same words as before.


without type_check

include machine.e
include file.e
include get.e
include wildcard.e as wild
include dll.e

-- one-byte tags for master index file
constant TAG_MONTH = 1,
	 TAG_SIGNATURE = 2,
	 TAG_FROM = 3,
	 TAG_OFFSET = 4

constant SIG_SIZE = 26   -- a..z
constant SIG_LEN = floor(SIG_SIZE * SIG_SIZE / 8)+1
constant TRUE = 1, FALSE = 0
constant TO_LOWER = 'a' - 'A'
constant SCORE = 1, LOCATION = 2 -- for top_hits
constant OUT_CHUNK_SIZE = 5  -- number of messages to output per table
constant LINE_WIDTH = 85  -- wrap output lines
constant UNKNOWN = 255
constant BIG_VALUE = 1000 -- large value for matching a search word
constant EOF = -1
constant COMPRESSED = FALSE  -- is monthly data compressed?
constant M_CODE="123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -- one-char codes month & year

type boolean(integer x)
    return x = TRUE or x = FALSE
end type

atom t0
t0 = time()

sequence top_hits  -- the best messages so far
top_hits = {}

sequence the_date
the_date = date()
the_date[1] += 1900

object query
query = ""
integer fromMonth, fromYear, toMonth, toYear
sequence postedBy, keywords
boolean thread  -- is this a search to find members of a thread?
thread = FALSE
object first_res

integer max_hits  -- max number of matching messages to collect
max_hits = 25 
integer max_per_page
max_per_page = 25
integer nhits, npartial, totalCount, scanned
nhits = 0
npartial = 0
totalCount = 0
scanned = 0

integer log_file
log_file = -1  -- open("esearch.log", "a")

function crash(object x)
-- in case of fire ...
    integer errfile
    
    errfile = open("ex.err", "a")
    puts(errfile, "\nquery is: " & query & '\n')
    close(errfile)

    -- send an e-mail containing ex.err
    system("mail -s \"esearch.exu crashed!\" rds@RapidEuphoria.com < ex.err > /dev/null", 2) 
    
    return 0
end function

crash_routine(routine_id("crash"))

procedure log_msg(sequence text)
-- record a message in the log
    if log_file != -1 then
	puts(log_file, text)
	puts(log_file, '\n')
	flush(log_file)
    end if
end procedure

function lower(sequence s)
-- (quickly) convert a line to lower case 
    integer c

    for i = 1 to length(s) do
	c = s[i]
	if c <= 'Z' then
	    if c >= 'A' then
		s[i] = c + TO_LOWER 
	    end if
	end if
    end for
    return s
end function

function delete_leading_white(sequence s)
-- delete leading whitespace    
    while length(s) and find(s[1], " \t\n\r") do
        s = s[2..length(s)]
    end while
    return s
end function

function delete_trailing_white(sequence s)
-- delete trailing whitespace
    while length(s) and find(s[length(s)], " \t\n\r") do
        s = s[1..length(s)-1]
    end while
    return s
end function

procedure html_puts(object s)
-- write out some HTML    
    puts(1, s)
end procedure

procedure html_printf(sequence format, object s)
-- write out some formatted HTML    
    printf(1, format, s)
end procedure

procedure stats()
-- save some stats for performance analysis 
    log_msg(sprintf("matched %d of %d, scanned %d, time: %.2f\n",
	   {nhits, totalCount, scanned, time()-t0}))
end procedure

procedure warnMessage(sequence msg)
-- issue a message but don't quit    
    html_puts("<p><font face=\"verdana, arial, geneva\" size=-1 color=\"#333333\">")
    html_printf("%s </font>\n</body></html>\n", {msg})
    flush(1)
    log_msg(msg)
    stats()
end procedure

procedure errMessage(sequence msg)
-- issue a fatal error message and quit 
    warnMessage(msg)
    abort(1)
end procedure

sequence dat

procedure save_words()
-- save scoring info to the word cache 
    integer fn, c, run, prev
    
    if thread then
        return  -- we don't have the correct score, only 3 months anyway
    end if
    
    for i = 1 to length(keywords) do
	fn = open("words/word_" & keywords[i], "wb")
	if fn != -1 then
	    puts(fn, int_to_bytes(length(dat[i])))
	    run = 0
	    prev = -1
	    -- use run-length compression on 0's and UNKNOWN's
	    for j = 1 to length(dat[i]) do
		c = dat[i][j]
		if c = 0 or c = UNKNOWN then
		    if prev = c then
			run += 1
			if run = 256 then
			   puts(fn, 255)
			   puts(fn, c)
			   run = 1
			end if
		    else
			if run then
			    puts(fn, run)
			end if
			puts(fn, c)
			run = 1
		    end if
		else
		    if run then
			puts(fn, run)
		    end if
		    puts(fn, c)
		    run = 0
		end if
		prev = c
	    end for
	    if run then
		puts(fn, run)
	    end if
	    close(fn)
	end if
    end for
end procedure

function terminate(integer x)
-- handle SIG_TERM termination request from server  
    warnMessage("Sorry, server was busy - try again\n")
    save_words()
    abort(1)
    return 0
end function

-- set up termination handler
atom terminate_addr
terminate_addr = call_back(routine_id("terminate"))

constant SIGTERM = 15

atom libc
libc = open_dll("libc.so")

integer signal

if libc != 0 then
    signal = define_c_proc(libc, "signal", {C_INT, C_INT})
    if signal != -1 then
	c_proc(signal, {SIGTERM, terminate_addr})
    end if
end if

integer current_month, current_year
integer current_file

current_month = -1
current_year = -1
current_file = -1
sequence current_name

function msg_open(sequence filename)
-- open a monthly message file for reading    
    if COMPRESSED then
    	system("zcat " & filename &  ".gz > msgtemp.TXT", 2)
    	return open("msgtemp.TXT", "r")
    else
    	return open(filename, "r")
    end if
end function

function open_month(integer year, integer month)
-- return file handle for a given month's data  
    if year = current_year and month = current_month then
	return current_file 
    end if
    if current_file > 0 then
	close(current_file)
    end if
    current_name = sprintf("%4d%02d.TXT", {year, month}) 
    current_file = msg_open("../mlist/" & current_name)
    if current_file <= 0 or getc(current_file) = -1 then
	current_file = open("../mlist/current.txt", "r")
	if current_file <= 0 then
	    errMessage("Couldn't open current.txt, or " & current_name)
	end if
    end if
    current_month = month
    current_year = year
    return current_file
end function

constant BASE_MONTH = 1996*12 + 6

procedure AddHit(sequence counts, sequence location, integer chars_per_line)
-- record the score for a message
-- keep track of the best scores and messages
    atom score
    integer p, byte_count, age
    
    byte_count = location[4] * chars_per_line
    score = 0
    if length(keywords) = 0 then
	-- we don't sort by message content
	score = 1 
    else
	if byte_count < 20 then
	    return  -- some garbage messages
        end if
	age = location[1]*12 + location[2] - BASE_MONTH
	-- score per word, counts[i] has max 254
	for i = 1 to length(counts) do
	    if counts[i] then
  	        -- first priority: number of *unique* words matched:
	        score += BIG_VALUE * (counts[i] > 0)
	        
	        -- second priority: total number of words matched,
	        --                  full vs. partial matches,
	        --                  adjusted for message length,
	        --                  and slightly favoring newer messages
	        score += sqrt(counts[i] / (4000+10*byte_count)) * 
	                 (1 + .001 * age)
	    end if
	end for
	if score = 0 then
	    return
	end if
    	if thread then
    	    score = BIG_VALUE + 1 -- same for all messages
    	end if
    end if
    
    nhits += 1
    if score < length(counts) * BIG_VALUE then
        npartial += 1
    end if
    if length(top_hits) = 0 then
	top_hits = {{score, location}}
    else    
	p = 0
	for i = length(top_hits) to 1 by -1 do
	    if thread then
	    	if score <= top_hits[i][SCORE] then  -- earliest first
	    	    p = i
	    	    exit
	    	end if
	    else
	    	if score < top_hits[i][SCORE] then  -- latest first
			p = i
			exit
	    	end if
	    end if
	end for
	if p != max_hits then
	    top_hits = append(top_hits[1..p], {score, location}) &
			    top_hits[p+1..length(top_hits)]
	    if length(top_hits) > max_hits then
		top_hits = top_hits[1..length(top_hits)-1]
	    end if
	end if
    end if
end procedure

sequence LETTER
LETTER = repeat(FALSE, 256)
for c = 0 to 255 do
    if (c >= '0' and c <= '9') or 
       (c >= 'a' and c <= 'z') or
       (c >= 'A' and c <= 'Z') then
	LETTER[c+1] = TRUE
    end if
end for

function scan(sequence keywords, integer year, integer month, 
	      integer offset, integer line_count)
-- scan one message for all possible keywords
-- return the scores for the various words - (10,7,5,0) per occurrence
    integer f, n, c, p, p2, len
    boolean white_before, white_after
    object line
    sequence tline
    sequence counts
    object word
    atom count
	
    f = open_month(year, month)

    if seek(f, offset) != 0 then
	errMessage(sprintf("seek to %d failed in %d/%d", 
			    {offset, year, month}))
    end if
    
    scanned += 1
    counts = repeat(0, length(keywords))
    
    line = gets(f)
    if match("X-EUFORUM:", line) then -- ignore it
        line = gets(f)
    end if
    
    for i = 1 to line_count do
	if atom(line) then
	    exit
	end if
	n = length(line)
	
	-- convert line to lower case - in-lined for speed
	for j = 1 to n-1 do
	    c = line[j]
	    if c <= 'Z' then
		if c >= 'A' then
		    c += TO_LOWER
		    line[j] = c 
		end if
	    end if
	end for
	
	for j = 1 to length(counts) do
	    -- count number of occurrences of word - in-lined for speed
	    word = keywords[j]
	    count = 0
	    tline = line
	    while TRUE do
		if atom(word) then
		    exit
		end if
		p = match(word, tline) -- should fail 99% of the time
		if thread then
		    p2 = match("subject:", line)
		    if p2 = 1 then
 	                -- subject line
		        if p > 8 then
		            return {1} -- always just one word
		    	else
		    	    return {0} -- no point in continuing
		    	end if
		    else 
		        p = 0
		    end if
		end if
		if p = 0 then
		    exit
		end if
		len = length(word)
		if p = 1 then
		    white_before = length(tline) = length(line) 
		else 
		    white_before = not LETTER[1+tline[p-1]]
		end if
		if (p + len > length(tline)) or not LETTER[1+tline[p+len]] then
		    white_after = TRUE
		else
		    white_after = FALSE
		end if
		if white_before and white_after then
		    count += 10    -- full credit: complete word match
		elsif len > 3 then  -- short words must match complete words  
		    if white_before or white_after then
			count += 7  -- prefix or suffix
		    else
			count += 5  -- middle of a substring
		    end if
		end if  
		tline = tline[p+len..$]
	    end while
	    counts[j] += count
	end for
	line = gets(f)
    end for
    return counts
end function

function sig_match(sequence word0, sequence signature)
-- return TRUE if word matches the message signature
    integer index, p, bit
    
    for j = 1 to length(word0) do
	-- check if bit is ON
	index = word0[j]
	p = floor(index/8)+1
	bit = power(2, and_bits(index, 7))
	if and_bits(signature[p], bit) = 0 then
	    return FALSE
	end if               
    end for
    return TRUE
end function

integer master
master = open("../mlist/master", "rb")
if master = -1 then
    errMessage("Couldn't open master file")
end if

constant WORD_CACHE_SIZE = 1000  -- retain this many word data files

procedure trim_cache()
-- remove surplus word data files
    integer oldest, n
    sequence word_files
    sequence oldest_date 
    
    word_files = dir("words")
    n = length(word_files)
    while n > WORD_CACHE_SIZE + 2 do
	-- find minimum date
	oldest = 0
	oldest_date = {9999}
	for i = 1 to length(word_files) do
	    if match("word_", word_files[i][D_NAME]) then
		if compare(oldest_date, word_files[i][D_YEAR..D_SECOND]) > 0 then
		    oldest = i             
		    oldest_date = word_files[i][D_YEAR..D_SECOND]
		end if
	    end if
	end for
	if oldest then
	    system("rm \"words/" & word_files[oldest][D_NAME] & '"', 2)     
	    word_files[oldest][D_NAME] = ""
	    n -= 1
	else
	    exit
	end if
    end while
end procedure

constant NUM_MESSAGES = 80000  -- at least this many in database

procedure search()
-- top level search 
    integer month, year, prev_year, offset, c, run, j
    boolean skip, look_for
    integer fn, m, firstm, lastm, line_count, chars_per_line, len, message_num
    integer a, b, index
    sequence lc, off, res, word
    sequence from, signature
    sequence keywords0, possible, s 

    keywords = wild:lower(keywords) -- general lower
    keywords0 = repeat({}, length(keywords))
    for i = 1 to length(keywords) do
	word = keywords[i]
	for k = 1 to length(word)-1 do
	    if word[k] >= 'a' and word[k] <= 'z' and
	       word[k+1] >= 'a' and word[k+1] <= 'z' then
		-- a 2-letter combo - record the index
		a = remainder(word[k], SIG_SIZE)
		b = remainder(word[k+1], SIG_SIZE)
		index = a * SIG_SIZE + b
		keywords0[i] = append(keywords0[i], index)
	    end if
	end for
    end for
    
    postedBy = lower(postedBy)
    firstm = fromYear*12+fromMonth
    lastm = toYear*12+toMonth
    off = repeat(0, 4)
    lc = repeat(0, 3)
    if first_res = 1 then
	html_puts("Searching ...\n")
    end if
    prev_year = -1
    
    -- load up word score data files from previous searches
    dat = repeat(0, length(keywords))
    for i = 1 to length(dat) do
	if thread then
	    fn = -1  -- can't use saved scores
	    log_msg("THREAD")
	else
	    fn = open("words/word_" & keywords[i], "rb")
	end if
	if fn != -1 then
	    log_msg("cache  HIT: " & keywords[i])
	    s = get_bytes(fn, 4)
	    if length(s) = 4 then
		len = bytes_to_int(s)
		dat[i] = repeat(UNKNOWN, len)
		j = 1
		while j <= len do
		    c = getc(fn)
		    if c = EOF then
			log_msg(sprintf(" - incomplete#1: %d/%d", {j, len}))
			exit  -- truncated
		    end if
		    if c = 0 or c = UNKNOWN then
			run = getc(fn)
			if run = EOF then
			    log_msg(sprintf(" - incomplete#2: %d/%d", {j, len}))
			    exit
			end if
			if c = 0 then
			    dat[i][j..j+run-1] = 0
			end if
			j += run
		    else
			dat[i][j] = c
			j += 1
		    end if
		end while
	    else
		-- corrupt file
		dat[i] = repeat(UNKNOWN, NUM_MESSAGES)
	    end if
	    close(fn)
	else
	    if not thread then
	        log_msg("cache MISS: " & keywords[i])
	    end if
	    dat[i] = repeat(UNKNOWN, NUM_MESSAGES)
	end if
    end for
    
    message_num = 0
    while 1 do
	c = getc(master)
	if c = EOF then
	    exit
	end if
	if c = TAG_MONTH then
	    -- month header
	    month = getc(master)
	    year = getc(master) + 1900  
	    c = getc(master)
	    if c = EOF then
		errMessage("unexpected EOF")
	    end if
	    m = year*12+month
	    if m >= firstm and m <= lastm then
		skip = FALSE
		if first_res = 1 and year != prev_year then
		    html_printf("%d ...\n", year)
		    flush(1)
		    prev_year = year
		end if
	    else
		skip = TRUE 
	    end if  
	end if
	if c != TAG_FROM then
	    errMessage("expected TAG_FROM")
	end if
	from = gets(master)
	c = getc(master)
	if c != TAG_SIGNATURE then
	    errMessage("expected TAG_SIGNATURE")
	end if
	signature = get_bytes(master, SIG_LEN)
	c = getc(master)
	if c != TAG_OFFSET then
	    errMessage("expected TAG_OFFSET")
	end if
	-- read offset
	off[1] = getc(master)
	off[2] = getc(master)
	off[3] = getc(master)
	off[4] = getc(master)
	
	-- read line count
	lc[1] = getc(master)
	lc[2] = getc(master)
	lc[3] = getc(master)
	
	chars_per_line = getc(master)
	
	message_num += 1
	
	if not skip then
	    -- date is ok
	    totalCount += 1 -- number of messages in date range
	    if length(postedBy) = 0 or match(postedBy, from) then
		-- poster is ok
		possible = repeat(0, length(keywords))
		look_for = FALSE
		for i = 1 to length(keywords) do
		    if message_num > length(dat[i]) then
			dat[i] &= repeat(UNKNOWN, 500 + message_num - length(dat[i]))
		    end if
		    if dat[i][message_num] = UNKNOWN then
			if sig_match(keywords0[i], signature) then
			    -- must scan for this word in the message
			    possible[i] = keywords[i]
			    look_for = TRUE
			else
			    -- no scan, score is 0
			    dat[i][message_num] = 0
			end if
		    end if
		end for
		
		offset = bytes_to_int(off)
		line_count = lc[1] + 256 * lc[2] + 65536 * lc[3]
		
		if look_for then
		    -- we have to scan this message for certain words
		    res = scan(possible, year, month, offset, line_count)
		    
		    for i = 1 to length(keywords) do
			if sequence(possible[i]) then
			    -- record what we learned
			    if res[i] >= UNKNOWN then
				res[i] = UNKNOWN-1 -- score per word is max 254
			    end if
			    dat[i][message_num] = res[i]        
			else
			    res[i] = dat[i][message_num]
			end if
		    end for
		else        
		    -- no words to scan for
		    res = repeat(0, length(keywords))
		    for i = 1 to length(keywords) do
			res[i] = dat[i][message_num]
		    end for
		end if
		AddHit(res, {year, month, offset, line_count}, chars_per_line)
	    end if
	end if
    end while

    -- save word data information
    save_words()
end procedure

procedure top_link(sequence percent, sequence name, sequence url, sequence w)
-- HTML for one top link    
	html_puts("<td bgcolor=\"#FFCC66\" width=\"" & percent &
		  "%\" align=center height=15 valign=bottom>\n")
	html_puts("<font face=\"verdana, arial, geneva\" size=-2>\n")
	html_puts("<a class=\"toplink\" href=\"" & url & "\"><b>" & name &
		  "</b></a></font></td>\n")
	html_puts("<td bgcolor=\"#FFCC66\" width=\"1%\" valign=top align=right>\n")
	html_puts("<img src=\"topcr.gif\" width=5 height=5></td>\n")
	if length(w) > 0 then
		html_puts("<td bgcolor=\"#FFFFFF\" width=" & w & 
			  "><img src=\"dum.gif\" width=1 height=1></td>\n")
	end if
end procedure

procedure top_links()
-- display the top links
	html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
	html_puts("<tr valign=top>\n")
	
	top_link("7", "Home", "index.html", "1")
	top_link("16", "What Is Euphoria?", "hotnew.htm", "1")
	top_link("14", "Documentation", "manual.htm", "1")
	top_link("7", "News", "news.htm", "1")
	top_link("14", "EUforum", "listserv.htm", "1")
	top_link("17", "Download Euphoria", "v20.htm", "1")
	top_link("18", "Donate", "reg.htm", "")
	
	html_puts("</tr>\n")
	html_puts("<tr>\n")
	
	html_puts("<td colspan=2 bgcolor=\"#FFCC66\" height=3>\n")
	html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
	
	for i = 1 to 6 do
		html_puts("<td bgcolor=\"#FFFFFF\"><img src=\"dum.gif\" width=1 height=1></td>\n")
		html_puts("<td colspan=2 bgcolor=\"#FFCC66\">\n")
		html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
	end for
	
	html_puts("</tr>\n")
	html_puts("<tr>\n")
	html_puts("<td colspan=20 bgcolor=\"#FFFFFF\" height=3>\n")
	html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
	html_puts("</tr>\n")
 
	html_puts("</table>\n")

	html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
	html_puts("<tr valign=top>\n")
	
	top_link("23", "Recent User Contributions", "contrib.htm", "1")
	top_link("12", "The Archive", "archive.htm", "1")
	top_link("22", "Other Euphoria Web Sites", "othersit.htm", "1") 
	top_link("16", "RDS Development", "contract.htm", "1")
	top_link("22", "Related Books & Software", "books.htm", "")
	
	html_puts("</tr>\n")
	html_puts("<tr>\n")
	html_puts("<td colspan=2 bgcolor=\"#FFCC66\" height=3>\n")
	
	for i = 1 to 4 do
		html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
		html_puts("<td bgcolor=\"#FFFFFF\"><img src=\"dum.gif\" width=1 height=1></td>\n")
		html_puts("<td colspan=2 bgcolor=\"#FFCC66\">\n")
	end for
	
	html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
	html_puts("</tr>\n")
 
	html_puts("</table>\n")
end procedure

procedure htmlHeader1()
-- First batch of HTML  
    html_puts("Content-type: text/html\n\n")
    html_puts("<html><head>\n")
    html_puts("<title>search results for EUforum</title>\n")
    html_puts("<base href=\"http://www.RapidEuphoria.com/\">\n")
    html_puts("<link REL=StyleSheet HREF=\"global.css\" TYPE=\"text/css\" MEDIA=screen>\n")
    html_puts("</head>\n")
    html_puts("<body bgcolor=\"#FFFFFF\" link=\"#003366\"" & 
	    " vlink=\"#006699\" text=\"#000000\">\n")
    html_puts("<basefont size=3>\n\n")

    top_links()
    
    html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
    html_puts("<tr valign=top>\n")
    html_puts("<td width=\"90%\"></td>\n")
    html_puts("<td></td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts("<td colspan=2 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts("<td colspan=2 bgcolor=\"#CCCC99\"><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts("<td colspan=2 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts("<td align=center>\n")
    html_puts("<font face=\"Arial, Helvetica\" color=\"#CC3366\" size=+2><b>\n")
    html_puts("Search Results</b></font></td>\n")
    html_puts("<td></td>\n")
    html_puts("</tr>\n")
    html_puts("</table>\n")
end procedure

constant month_codes = "123456789ABC"
	 
constant months = {"Jan", "Feb", "Mar", "Apr", "May", "Jun",
		   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"} 

sequence keystring

procedure htmlHeader2()
-- second batch of HTML
    sequence selected
    
    html_puts("<p>\n")
    html_puts("<form method=GET action=\"http://www.OpenEuphoria.org/cgi-bin/esearch.exu\">\n\n")

    html_puts("<table border=1 cellpadding=0 cellspacing=0 width=\"100%\">\n")
    html_puts("<tr bgcolor=\"#FFFFEE\">\n")
    html_puts("<td>\n")
    html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
    html_puts("<tr>\n")
    html_puts("<td width=\"5%\"></td>\n")
    html_puts("<td width=\"17%\"></td>\n")
    html_puts("<td width=\"17%\"></td>\n")
    html_puts("<td width=\"22%\"></td>\n")
    html_puts("<td width=\"28%\"></td>\n")
    html_puts("<td width=\"11%\"></td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts("<td colspan=6 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts("<td></td>\n")
    html_puts("<td colspan=3><font face=\"Arial, Helvetica\" size=3 color=\"#990033\">\n")
    html_puts("<b>EUforum Search</b></font>\n")
    html_puts("</td>\n")
    html_puts("<td colspan=2 align=center>\n")
    html_puts("<font face=\"Comic Sans MS\" size=-2><i>\n")
    html_puts("<a href=\"esearch.txt\">Powered by Euphoria</a></i></font>\n")
    html_puts("</td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts("<td colspan=6 height=7><img src=\"dum.gif\" width=1 height=1></td></tr>\n")
    html_puts("<tr>\n")
    html_puts("<td></td>\n")
    html_puts("<td><font face=\"Arial, Helvetica\" size=-2 color=\"#990033\">\n")
    html_puts("Starting:<br>\n")
    html_puts("<select name=\"fromMonth\">\n")
    for i = 1 to length(month_codes) do
	if fromMonth = i then
	    selected = "selected"
	else
	    selected = ""
	end if
	html_printf("<option value=\"%s\" %s>%s\n", 
	       {month_codes[i], selected, months[i]})
    end for
    html_puts("</select>\n")
  
    html_puts("<select name=\"fromYear\">\n")
    
    for i = 1996 to the_date[1] do 
	if fromYear = i then
	    selected = "selected"
	else
	    selected = ""
	end if
	html_printf("<option value=\"%s\" %s>%d\n", {M_CODE[i-1995], selected, i})
    end for
    
    html_puts("</select>\n")
    html_puts("</font></td>\n")
	    
    html_puts("<td><font face=\"Arial, Helvetica\" size=-2 color=\"#990033\">\n")
    html_puts("Ending:<br>\n")

    html_puts("<select name=\"toMonth\">\n")
    for i = 1 to length(month_codes) do
	if toMonth = i then
	    selected = "selected"
	else
	    selected = ""
	end if
	html_printf("<option value=\"%s\" %s>%s\n", 
	       {month_codes[i], selected, months[i]})
    end for
    html_puts("</select>\n")

    html_puts("<select name=\"toYear\">\n")
    for i = 1996 to the_date[1] + (toYear > the_date[1]) do
	if toYear = i then
	    selected = "selected"
	else
	    selected = ""
	end if
	html_printf("<option value=\"%s\" %s>%d\n", {M_CODE[i-1995], selected, i})
    end for
    html_puts("</select>\n")
    
    html_puts("</font></td>\n")

    html_puts("<td><font face=\"Arial, Helvetica\" size=-2 color=\"#990033\">\n")
    html_puts("Posted by:<br>\n")
    html_puts("<input type=\"text\" name=\"postedBy\" size=20 ")
    html_printf("value=\"%s\"", {postedBy})
    html_puts(">\n</font></td>\n")
    puts(1,
"<td valign=top><font face=\"Arial, Helvetica\" size=-2 color=\"#990033\">\n")
    html_puts("Keywords:<br>\n")
    html_puts("<input type=\"text\" name=\"keywords\" size=25 ")
    html_puts("value=\"")

    for j = 1 to length(keystring) do
    	if keystring[j] = '"' then
    	    html_puts(""")
    	else
    	    html_puts(keystring[j])
        end if
    end for
    
    html_puts("\"></font></td>\n")

    html_puts("<td><input type=\"submit\" value=\"Search!\"></td>\n")
    html_puts("</tr>\n")
	
    html_puts("<tr>\n")
    html_puts("<td colspan=6 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("</tr>\n")
    html_puts("</table>\n")
    html_puts("</td>\n")
    html_puts("</tr>\n")

    html_puts("</table>\n\n")
    html_puts("</form>\n\n")
    html_puts("<p>\n")
end procedure

function from_line(sequence low_line)
-- Is this the FROM: line? 
    integer f, b
    
    f = match("from:", low_line)
    if f = 0 then
	return FALSE
    end if
    b = find('>', low_line)
    if b = 0 then
	return TRUE
    end if
    return f <= 4 and f < b  
end function

boolean made_bold

function insert_bold(sequence line, sequence keyword, integer small)
-- mark all occurrences of keyword with <b>...</b> 
-- note: after inserting <b>...</b> we'll have a glitch if
-- a subsequent keyword happens to be 'b' 
    integer p, p1
    integer len 
    boolean good, white_before, white_after
    sequence low_line
    
    low_line = lower(line)
    len = length(keyword)
    p = 1
    while 1 do
	p1 = match(keyword, low_line[p..length(low_line)]) 
	if p1 = 0 then
	    exit
	end if
	p += p1 - 1 
	if len <= small then     
	    if p = 1 or not LETTER[1+low_line[p-1]] then
		white_before = TRUE
	    else
		white_before = FALSE    
	    end if
	    if p+len > length(low_line) or not LETTER[1+low_line[p+len]] then
		white_after = TRUE
	    else
		white_after = FALSE
	    end if
	    if white_before and white_after then
		good = TRUE
	    else 
		good = FALSE
		p += len
	    end if
	else
	    good = TRUE
	end if
	if good then
            made_bold = TRUE
	    line = line[1..p-1] & "<b>" & line[p..p+len-1] & "</b>" &
		   line[p+len..length(line)]
	    low_line = lower(line)
	    p += 3+len+4
	end if
    end while
    return line
end function

boolean found_subject
sequence subject

procedure subject_underline(sequence line, integer year, integer month)
-- add the href to the subject line
    integer start
    integer m1, m2, y1, y2
    
    if line[length(line)] = '\n' then
        line = line[1..length(line)-1]
    end if
    start = find(':', line)+1  -- must be there
    while start <= length(line) do
    	if not find(line[start], " \t\n\r") then
    	    exit
    	end if
    	start += 1
    end while
    html_puts(line[1..start-1])
    html_puts("<a href=http://www.OpenEuphoria.org/cgi-bin/esearch.exu?")
    y1 = year
    m1 = month - 1
    if m1 = 0 then
        m1 = 12
        y1 -= 1
    end if
    y2 = year
    m2 = month+1
    if m2 = 13 then
        m2 = 1
        y2 += 1
    end if
    html_printf("thread=1&fromMonth=%s&fromYear=%s&toMonth=%s&toYear=%s",
                {M_CODE[m1], M_CODE[y1-1995], M_CODE[m2], M_CODE[y2-1995]})
    html_puts("&keywords=\"" & subject & "\">")
    if thread then
        html_puts("<b>")
    end if
    html_puts(line[start..length(line)])
    if thread then
        html_puts("</b>")
    end if
    html_puts("</a>\n")
end procedure

function blank_plus(sequence s)
-- replace blanks by plus signs
-- and various other characters by hex codes
    sequence new_s 
    new_s = ""
    for i = 1 to length(s) do
        if find(s[i], " \t\r") then
            new_s &= '+'
        elsif s[i] = '?' then
            new_s &= "%3F"
        elsif s[i] = '&' then
            new_s &= "%26"
        elsif s[i] = '%' then
            new_s &= "%25"
        elsif s[i] = '+' then
            new_s &= "%2B"
        elsif s[i] = '#' then
            new_s &= "%23"
        else
            new_s &= s[i]
        end if
    end for
    return new_s
end function

function delete_re(sequence s)
-- remove all "Re:" etc
    integer r
    sequence sl
    
    sl = lower(s)
    while TRUE do
        r = match("re:", lower(s))
        if r then
            s = s[1..r-1] & s[r+3..length(s)]
        else
            exit
        end if
    end while
    return s
end function

function set_subject(sequence line)
-- set the subject words, minus any "Re:" etc.
    integer substart
    
    if not found_subject then
    	substart = match("subject:", lower(line))
        if substart = 1 then
	    found_subject = TRUE
            subject = line[substart+8..length(line)]
            subject = delete_re(subject)
            subject = delete_leading_white(delete_trailing_white(subject))
            subject = blank_plus(subject)
            return TRUE
        else
            return FALSE
        end if
    end if
    return FALSE -- never reached???
end function

boolean found_from

procedure show_bold(sequence text, integer year, integer month)
-- display one line with keywords in bold 
    sequence low_line
    sequence line
    boolean sub
    
    sub = set_subject(text)
    
    line = ""

    for i = 1 to length(text) do
	-- replace < and >
	if text[i] = '<' then
	    line &= "<"
	elsif text[i] = '>' then
	    line &= ">"
	else 
	    line &= text[i]
	end if
    end for
    low_line = lower(line)
    
    if not thread then
    	if length(postedBy) and not found_from and from_line(low_line) then
	    line = insert_bold(line, postedBy, 0)
	    found_from = TRUE
    	else 
	    for k = 1 to length(keywords) do
	    	line = insert_bold(line, keywords[k], 3)
	    end for
    	end if
    end if	
    if sub then
        subject_underline(line, year, month)
    else
        html_puts(line)
    end if
end procedure

procedure printTabHead()
-- HTML for table of messages   
    html_puts("<table border=0 width=655 cellpadding=0 cellspacing=0>\n")
    html_puts("<tr>\n")
    html_puts("<td width=450 nowrap></td>\n")
    html_puts("<td width=150 nowrap></td>\n")
    html_puts("</tr>\n")
end procedure

procedure printResult()
-- output the entries that match
    integer count, t, tabSize, numLeft, numIteration, e, m
    integer prev_t, chop, fn, year, month, offset, line_count, fscore
    atom score
    boolean no_hit_in_msg
    object line, euforum_num
    sequence s, match_type
    
    if length(top_hits) = 0 then
	errMessage("No match found. Try again.")    
	html_puts("\n</body></html>\n")
	return
    end if
  
    no_hit_in_msg = FALSE
    
    html_puts("<p><font face=\"verdana, arial, geneva\" size=-1 color=\"#333333\">")
    
    if thread then
    	if nhits = 1 then
    	    s = ""
    	else
    	    s = "s"
    	end if
    	html_printf("%d message%s in thread", {nhits,s})
    else
	if length(keywords) > 1 then
	    html_puts("exactly ")
	end if
	html_printf("matched %d", nhits-npartial)
	if npartial > 0 then
	    html_printf(", partially matched %d", npartial) 
	end if
	html_printf(" of %d messages in date range", totalCount)
    end if
  
    tabSize = OUT_CHUNK_SIZE -- number of entries in one <tab> table for speed
    if nhits - first_res + 1 < max_per_page then
	numLeft = nhits - first_res + 1 
    else
	numLeft = max_per_page
    end if
    if nhits > max_per_page then
	html_printf(" - displaying results %d to %d", 
		{first_res, first_res + numLeft - 1})
    end if
    html_puts("</font>\n<p>\n")
    
    t = first_res
    count = 0
    while numLeft > 0 do 
	if numLeft <= tabSize then
	    numIteration = numLeft
	    numLeft = 0
	else 
	    numIteration = tabSize
	    numLeft -= tabSize
	end if
	
	-- do a set in one table so browser can start printing it 
	printTabHead()
	for k = 1 to numIteration do
	    count += 1
	    found_from = FALSE
	    found_subject = FALSE
	    
	    score = top_hits[t][SCORE]
	    year = top_hits[t][LOCATION][1]
	    month = top_hits[t][LOCATION][2]
	    fn = open_month(year, month)
	    if fn <= 0 then
		return
	    end if
	    offset = top_hits[t][LOCATION][3]
	    if seek(fn, offset) != 0 then
		errMessage(sprintf("seek to %d failed in %d/%d", 
			    {offset, year, month}))
	    end if
	    line_count = top_hits[t][LOCATION][4]
	    
	    line = gets(fn)
	    euforum_num = {}
	    
    	    m = match("X-EUFORUM: ", line)
	    if m then
	        if year*12 + month >= 
	           the_date[1]*12 + the_date[2] - 3 then
	            -- fairly recent, might be in EUforum
	    	    line = line[m+11..length(line)]
	    	    euforum_num = value(line)
	    	    if euforum_num[1] = GET_SUCCESS then
	    	        euforum_num = euforum_num[2]
	    	        e = open(sprintf("../EUforum/m%d.html", euforum_num), "r") 	
	    	        if e != -1 then
	    	            close(e)
	    	        else
	    	            euforum_num = {}  -- message not available anymore
	    	        end if
	    	    end if
	    	end if
      	        line = gets(fn) -- skip to real first line
	    end if	
	    
	    fscore = floor(score/BIG_VALUE)
	    if fscore = length(keywords) then
	        match_type = "exact match"
	    else
	        match_type = "partial match"
	    end if
	    if atom(euforum_num) then
	        html_printf("<tr bgcolor=\"#40CC40\"><td>" & 
	          "<font face=ARIAL size=2 color=#FFFFFF> <b># %d", t)
	        
	        if length(keywords) > 1 then
	            html_printf("  %s %d/%d", 
	                    {match_type, fscore, length(keywords)})
	        end if   
	        html_printf("</b></font></td>\n<td align=right><font face=ARIAL size=1 " &
	          "color=#FFFFFF>view this message in " &
	          "<a class=\"euforum\" href=http://www.OpenEuphoria.org/EUforum/m%d.html>" & 
	          "EUforum</a>  </font></td></tr>\n", euforum_num)
            else
	        html_printf("<tr bgcolor=\"#40CC40\"><td colspan=2>" & 
	          "<font face=ARIAL size=2 color=#FFFFFF> <b># %d", t)
	        
	        if length(keywords) > 1 then
	            html_printf("  %s %d/%d", 
	                    {match_type, fscore, length(keywords)})
	        end if
	        html_puts("</b></font></td></tr>\n")
	    end if
	    
	    if remainder(count, 2) = 1 then
		html_puts("<tr><td colspan=2 bgcolor=\"#FFCCFF\"><pre>")
	    else
		html_puts("<tr><td colspan=2 bgcolor=\"#CCCCFF\"><pre>")
	    end if
	    
	    made_bold = FALSE
	    for i = 1 to line_count do
		if atom(line) then
		    exit
		end if
		while length(line) > LINE_WIDTH do
		    chop = LINE_WIDTH
		    for j = 1 to 7 do
			if find(line[chop], " \t\n\r,.)]") then
			    exit
			end if
			chop -= 1
		    end for
		    if chop = LINE_WIDTH-7 then
			chop = LINE_WIDTH
		    end if
		    
		    show_bold(line[1..chop], year, month)
		    html_puts('\n')
		    line = line[chop+1..length(line)]
		end while
		show_bold(line, year, month)
		line = gets(fn)
	    end for
	    if not made_bold then
	        no_hit_in_msg = TRUE -- something is wrong
	    end if
	    html_puts("</pre></td></tr>\n")
	    prev_t = t
	    t += 1
	end for
	html_puts("\n</table>\n")
    end while
    
    html_puts("<p> <br>\n")
    html_puts("<font face=\"verdana, arial, geneva\" size=2 color=\"#333333\">")
    count = nhits - first_res + 1  
    if nhits > max_per_page then        
	-- more than one page of results
	chop = match("&first_res=", query)
	if chop then
	    -- remove old one
	    while TRUE do
		query = query[1..chop-1] & query[chop+1..length(query)] 
		if chop > length(query) or query[chop] = '&' then
		    exit
		end if
	    end while
	end if
	count -= max_per_page
	html_puts("<center>")
	if first_res > max_per_page then
	    html_puts("<= <a href=http://www.OpenEuphoria.org/cgi-bin/esearch.exu?" & query)
	    html_printf("&first_res=%d>Previous %d",
		      {first_res - max_per_page, max_per_page})
	    html_puts(" Results</a>\n") 
	end if
	if count > 0 then
	    -- more results coming after this page
	    html_puts("    <a href=http://www.OpenEuphoria.org/cgi-bin/esearch.exu?" & query)
	    if count > max_per_page then
		html_printf("&first_res=%d>Next %d",
			  {first_res + max_per_page, max_per_page})
	    else
		html_printf("&first_res=%d\">Final %d",
		    {first_res + max_per_page, count})
	    end if
	    html_puts(" Results</a> =>") 
	end if
	html_puts("</center></font>\n")
    else
	html_puts("<center>End of Search Results</center></font>\n")
    end if
    html_puts("\n<p> </body></html>\n")

    if no_hit_in_msg and not thread and length(keywords) then
        -- probably a word-wrap issue, could be on any page of results
        -- viewed by the user
        
        -- system("echo \"" & query & "\" > ex.err", 2)
        -- system(
        -- "mail -s \"esearch.exu no_hit_in_msg!\" rds@RapidEuphoria.com < ex.err > /dev/null", 
        -- 2) 
    end if
end procedure

function hex_char(integer c)
-- is c a valid hex character?
    return find(c, "0123456789ABCDEFabcdef")
end function

function hex_val(integer c)
-- return value of a hex character  
    if c >= 'A' and c <= 'F' then
	return 10 + c - 'A'
    elsif c >= 'a' and c <= 'f' then
	return 10 + c - 'a'
    else 
	return c - '0'
    end if
end function

function parse_input(sequence s)
-- crack the syntax sent from Web browser: aaa=bbb&ccc=ddd&...
-- Convert to {{"aaa", "bbb"}, {"ccc", "ddd"}, ...} left-right pairs
    integer i, c
    sequence word_pairs, left_word, right_word
    
    word_pairs = {}
    i = 1
    s &= {0,0,0} -- end markers
    while s[i] != 0 do
    left_word = ""
    while 1 do
	-- build left word
	c = s[i]
	if c = '=' or c = '&' or c = 0 then
	    exit
	end if
	if c = '%' and hex_char(s[i+1]) and hex_char(s[i+2]) then
	    c = 16 * hex_val(s[i+1]) + hex_val(s[i+2])
	    i += 2
	elsif c = '+' then
	    c = ' '
	end if
	left_word &= c
	i += 1
    end while
    
    i += 1  
    right_word = ""
    while 1 do
	-- build right word
	c = s[i]
	if c = '&' or c = 0 then
	    exit
	end if
	if c = '%' and hex_char(s[i+1]) and hex_char(s[i+2]) then
	    c = 16 * hex_val(s[i+1]) + hex_val(s[i+2])
	    i += 2
	elsif c = '+' then
	    c = ' '
	end if
	right_word &= c
	i += 1
    end while
    i += 1
    word_pairs = append(word_pairs, {left_word, right_word})
    end while
    return word_pairs
end function

function getKeywords()
-- get values from the CGI query string, e.g.
-- fromMonth=6&fromYear=1&toMonth=3&toYear=7&postedBy=rds&keywords=apple+orange 
    sequence key, pairs, var, val
    boolean inquote
    
    query = getenv("QUERY_STRING")
    if atom(query) then
	query = getenv("query_string")
	if atom(query) then 
	    errMessage("Internal Error - no query_string")
	end if
    end if

    log_msg(sprintf("%d-%d-%d %d:%02d\n%s", 
	       append(the_date[1..5], query)))
	
    pairs = parse_input(query)
    
    -- defaults in case of corrupted query string
    fromMonth = 6
    fromYear = 1996
    toMonth = the_date[2]
    toYear = the_date[1]
    keystring = ""
    postedBy = ""
    first_res = 1
    for i = 1 to length(pairs) do
	var = lower(pairs[i][1])
	val = pairs[i][2]
	
	if equal(var, "frommonth") and length(val) then
	    if val[1] >= 'A' then
		fromMonth = val[1] - 'A' + 10
	    else
		fromMonth = val[1] - '0'    
	    end if
	
	elsif equal(var, "fromyear") and length(val) then
	    if val[1] >= 'A' then
		fromYear = 1995 + val[1] - 'A' + 10
	    else
		fromYear = 1995 + val[1] - '0'  
	    end if
	
	elsif equal(var, "tomonth") and length(val) then
	    if val[1] >= 'A' then
		toMonth = val[1] - 'A' + 10
	    else
		toMonth = val[1] - '0'  
	    end if
	
	elsif equal(var, "toyear") and length(val) then
	    if val[1] >= 'A' then
		toYear = 1995 + val[1] - 'A' + 10
	    else
		toYear = 1995 + val[1] - '0'    
	    end if
	
	elsif equal(var, "postedby") then
	    postedBy = val
	    while length(postedBy) > 0 and 
		(postedBy[1] = ' ' or postedBy[1] = '\t') do
		postedBy = postedBy[2..length(postedBy)]
	    end while
	
	elsif equal(var, "thread") then
	    thread = TRUE
	  
	elsif equal(var, "keywords") then
	    keystring = val 
	
	elsif equal(var, "first_res") then
	    first_res = value(val)
	    if first_res[1] = GET_SUCCESS then
		first_res = first_res[2]
		max_hits += first_res-1
	    else
		first_res = 1
	    end if
	
	end if
    end for

    if fromYear = 1996 and fromMonth < 6 then
	fromMonth = 6
    end if  

    keywords = {}

    if fromYear * 100 +fromMonth > toYear*100 + toMonth then
	return
	"Starting month/year must be earlier or equal to ending month/year"
    end if
    
    -- make list of keywords from keystring
    key = ""
    if thread then
        -- just one string to search for (and it might contain double-quotes)
        if length(keystring) and keystring[1]='"' then
            keystring = keystring[2..$]
        end if
            
        if length(keystring) and keystring[$] = '"' then
            keystring = keystring[1..$-1]
        end if
    
        if length(keystring) then
            keywords = {keystring}
    	else
    	    keywords = {}
        end if
    else
        keystring &= ' '
        inquote = FALSE
        for i = 1 to length(keystring) do
	    if keystring[i] = '"' then
	        inquote = not inquote
	    	
	    elsif not inquote and keystring[i] = ' ' then
	        if length(key) then
		    keywords = append(keywords, key)
		    key = ""
	        end if
	
	    elsif not find(keystring[i], "`'&") then
	        key = append(key, keystring[i])
	 
	    end if
        end for
    
        keystring = keystring[1..length(keystring)-1]
    end if
    
    return ""
end function
    
sequence msg
    
htmlHeader1()

msg = getKeywords()

htmlHeader2()

if length(msg) > 0 then
    errMessage(msg)
end if

flush(1)

search()

log_msg(sprintf("search completed, time: %.2f", time()-t0))

printResult()

flush(1)

stats()

if not thread and remainder(the_date[5], 10) = 0 then
    trim_cache()
end if