#!./exu
-- How to use EUforum Search
--
-- Searches are not case-sensitive.
--
-- Matching messages are displayed in order of score. Logical connectives
-- such as "and" and "or" are not supported, but the scoring system always
-- favors messages that contain many different search words, over messages
-- that contain the same search word many times. For example, if you search
-- for:
-- object oriented
--
-- then messages that contain both "object" and "oriented" (an exact match)
-- will always rank higher than messages that contain only one of these words,
-- even where there are many occurrences of that one word.
--
-- The results will indicate the number of exact matches, versus the
-- number of partial matches, and the exact matches will be listed first.
--
-- To search for a phrase, put double-quotes around the words in the phrase.
-- e.g "Euphoria programming language"
--
-- Specifying "Posted by" will limit the search to messages where the
-- "From:" line contains the string that you provide. If no keywords are
-- specified, the messages by that poster will be sorted most recent first.
--
-- If you specify neither "Posted By" nor "Keywords", you'll get a
-- chronological listing of messages in the chosen date range, most
-- recent message first.
--
-- To extract all the messages for a given thread, click on the Subject
-- line of any message in that thread. The complete thread will be
-- displayed in chronological order.
--
-- Recent messages in the search results (past week or so) can also be viewed
-- on the EUforum message board. Just click the link
-- "view this message in EUforum". This will also make it easier to
-- reply to the message.
--
-- How it works internally:
--
-- Several years of EUforum messages (well over 100 Mb) are stored
-- in monthly files on OpenEuphoria.org and are updated immediately
-- when a new message is posted.
--
-- The search is speeded up by having a master index file of all messages.
-- It contains the poster's name and a special "signature" that records
-- all the 2-letter combinations that exist in that message. The signature
-- consists of 676 bits (26x26). When searching for a keyword, only the
-- messages that have all the necessary 2-letter combinations
-- contained in that keyword are searched.
--
-- The search is also speeded up by keeping a cache of the most recent
-- words that have been searched for, with all of the scoring information
-- for those words. This is particularly helpful when the user moves
-- to the next page of results, and also when he performs a modified
-- search using some of the same words as before.
without type_check
include machine.e
include file.e
include get.e
include wildcard.e as wild
include dll.e
-- one-byte tags for master index file
constant TAG_MONTH = 1,
TAG_SIGNATURE = 2,
TAG_FROM = 3,
TAG_OFFSET = 4
constant SIG_SIZE = 26 -- a..z
constant SIG_LEN = floor(SIG_SIZE * SIG_SIZE / 8)+1
constant TRUE = 1, FALSE = 0
constant TO_LOWER = 'a' - 'A'
constant SCORE = 1, LOCATION = 2 -- for top_hits
constant OUT_CHUNK_SIZE = 5 -- number of messages to output per table
constant LINE_WIDTH = 85 -- wrap output lines
constant UNKNOWN = 255
constant BIG_VALUE = 1000 -- large value for matching a search word
constant EOF = -1
constant COMPRESSED = FALSE -- is monthly data compressed?
constant M_CODE="123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -- one-char codes month & year
type boolean(integer x)
return x = TRUE or x = FALSE
end type
atom t0
t0 = time()
sequence top_hits -- the best messages so far
top_hits = {}
sequence the_date
the_date = date()
the_date[1] += 1900
object query
query = ""
integer fromMonth, fromYear, toMonth, toYear
sequence postedBy, keywords
boolean thread -- is this a search to find members of a thread?
thread = FALSE
object first_res
integer max_hits -- max number of matching messages to collect
max_hits = 25
integer max_per_page
max_per_page = 25
integer nhits, npartial, totalCount, scanned
nhits = 0
npartial = 0
totalCount = 0
scanned = 0
integer log_file
log_file = -1 -- open("esearch.log", "a")
function crash(object x)
-- in case of fire ...
integer errfile
errfile = open("ex.err", "a")
puts(errfile, "\nquery is: " & query & '\n')
close(errfile)
-- send an e-mail containing ex.err
system("mail -s \"esearch.exu crashed!\" rds@RapidEuphoria.com < ex.err > /dev/null", 2)
return 0
end function
crash_routine(routine_id("crash"))
procedure log_msg(sequence text)
-- record a message in the log
if log_file != -1 then
puts(log_file, text)
puts(log_file, '\n')
flush(log_file)
end if
end procedure
function lower(sequence s)
-- (quickly) convert a line to lower case
integer c
for i = 1 to length(s) do
c = s[i]
if c <= 'Z' then
if c >= 'A' then
s[i] = c + TO_LOWER
end if
end if
end for
return s
end function
function delete_leading_white(sequence s)
-- delete leading whitespace
while length(s) and find(s[1], " \t\n\r") do
s = s[2..length(s)]
end while
return s
end function
function delete_trailing_white(sequence s)
-- delete trailing whitespace
while length(s) and find(s[length(s)], " \t\n\r") do
s = s[1..length(s)-1]
end while
return s
end function
procedure html_puts(object s)
-- write out some HTML
puts(1, s)
end procedure
procedure html_printf(sequence format, object s)
-- write out some formatted HTML
printf(1, format, s)
end procedure
procedure stats()
-- save some stats for performance analysis
log_msg(sprintf("matched %d of %d, scanned %d, time: %.2f\n",
{nhits, totalCount, scanned, time()-t0}))
end procedure
procedure warnMessage(sequence msg)
-- issue a message but don't quit
html_puts("<p><font face=\"verdana, arial, geneva\" size=-1 color=\"#333333\">")
html_printf("%s </font>\n</body></html>\n", {msg})
flush(1)
log_msg(msg)
stats()
end procedure
procedure errMessage(sequence msg)
-- issue a fatal error message and quit
warnMessage(msg)
abort(1)
end procedure
sequence dat
procedure save_words()
-- save scoring info to the word cache
integer fn, c, run, prev
if thread then
return -- we don't have the correct score, only 3 months anyway
end if
for i = 1 to length(keywords) do
fn = open("words/word_" & keywords[i], "wb")
if fn != -1 then
puts(fn, int_to_bytes(length(dat[i])))
run = 0
prev = -1
-- use run-length compression on 0's and UNKNOWN's
for j = 1 to length(dat[i]) do
c = dat[i][j]
if c = 0 or c = UNKNOWN then
if prev = c then
run += 1
if run = 256 then
puts(fn, 255)
puts(fn, c)
run = 1
end if
else
if run then
puts(fn, run)
end if
puts(fn, c)
run = 1
end if
else
if run then
puts(fn, run)
end if
puts(fn, c)
run = 0
end if
prev = c
end for
if run then
puts(fn, run)
end if
close(fn)
end if
end for
end procedure
function terminate(integer x)
-- handle SIG_TERM termination request from server
warnMessage("Sorry, server was busy - try again\n")
save_words()
abort(1)
return 0
end function
-- set up termination handler
atom terminate_addr
terminate_addr = call_back(routine_id("terminate"))
constant SIGTERM = 15
atom libc
libc = open_dll("libc.so")
integer signal
if libc != 0 then
signal = define_c_proc(libc, "signal", {C_INT, C_INT})
if signal != -1 then
c_proc(signal, {SIGTERM, terminate_addr})
end if
end if
integer current_month, current_year
integer current_file
current_month = -1
current_year = -1
current_file = -1
sequence current_name
function msg_open(sequence filename)
-- open a monthly message file for reading
if COMPRESSED then
system("zcat " & filename & ".gz > msgtemp.TXT", 2)
return open("msgtemp.TXT", "r")
else
return open(filename, "r")
end if
end function
function open_month(integer year, integer month)
-- return file handle for a given month's data
if year = current_year and month = current_month then
return current_file
end if
if current_file > 0 then
close(current_file)
end if
current_name = sprintf("%4d%02d.TXT", {year, month})
current_file = msg_open("../mlist/" & current_name)
if current_file <= 0 or getc(current_file) = -1 then
current_file = open("../mlist/current.txt", "r")
if current_file <= 0 then
errMessage("Couldn't open current.txt, or " & current_name)
end if
end if
current_month = month
current_year = year
return current_file
end function
constant BASE_MONTH = 1996*12 + 6
procedure AddHit(sequence counts, sequence location, integer chars_per_line)
-- record the score for a message
-- keep track of the best scores and messages
atom score
integer p, byte_count, age
byte_count = location[4] * chars_per_line
score = 0
if length(keywords) = 0 then
-- we don't sort by message content
score = 1
else
if byte_count < 20 then
return -- some garbage messages
end if
age = location[1]*12 + location[2] - BASE_MONTH
-- score per word, counts[i] has max 254
for i = 1 to length(counts) do
if counts[i] then
-- first priority: number of *unique* words matched:
score += BIG_VALUE * (counts[i] > 0)
-- second priority: total number of words matched,
-- full vs. partial matches,
-- adjusted for message length,
-- and slightly favoring newer messages
score += sqrt(counts[i] / (4000+10*byte_count)) *
(1 + .001 * age)
end if
end for
if score = 0 then
return
end if
if thread then
score = BIG_VALUE + 1 -- same for all messages
end if
end if
nhits += 1
if score < length(counts) * BIG_VALUE then
npartial += 1
end if
if length(top_hits) = 0 then
top_hits = {{score, location}}
else
p = 0
for i = length(top_hits) to 1 by -1 do
if thread then
if score <= top_hits[i][SCORE] then -- earliest first
p = i
exit
end if
else
if score < top_hits[i][SCORE] then -- latest first
p = i
exit
end if
end if
end for
if p != max_hits then
top_hits = append(top_hits[1..p], {score, location}) &
top_hits[p+1..length(top_hits)]
if length(top_hits) > max_hits then
top_hits = top_hits[1..length(top_hits)-1]
end if
end if
end if
end procedure
sequence LETTER
LETTER = repeat(FALSE, 256)
for c = 0 to 255 do
if (c >= '0' and c <= '9') or
(c >= 'a' and c <= 'z') or
(c >= 'A' and c <= 'Z') then
LETTER[c+1] = TRUE
end if
end for
function scan(sequence keywords, integer year, integer month,
integer offset, integer line_count)
-- scan one message for all possible keywords
-- return the scores for the various words - (10,7,5,0) per occurrence
integer f, n, c, p, p2, len
boolean white_before, white_after
object line
sequence tline
sequence counts
object word
atom count
f = open_month(year, month)
if seek(f, offset) != 0 then
errMessage(sprintf("seek to %d failed in %d/%d",
{offset, year, month}))
end if
scanned += 1
counts = repeat(0, length(keywords))
line = gets(f)
if match("X-EUFORUM:", line) then -- ignore it
line = gets(f)
end if
for i = 1 to line_count do
if atom(line) then
exit
end if
n = length(line)
-- convert line to lower case - in-lined for speed
for j = 1 to n-1 do
c = line[j]
if c <= 'Z' then
if c >= 'A' then
c += TO_LOWER
line[j] = c
end if
end if
end for
for j = 1 to length(counts) do
-- count number of occurrences of word - in-lined for speed
word = keywords[j]
count = 0
tline = line
while TRUE do
if atom(word) then
exit
end if
p = match(word, tline) -- should fail 99% of the time
if thread then
p2 = match("subject:", line)
if p2 = 1 then
-- subject line
if p > 8 then
return {1} -- always just one word
else
return {0} -- no point in continuing
end if
else
p = 0
end if
end if
if p = 0 then
exit
end if
len = length(word)
if p = 1 then
white_before = length(tline) = length(line)
else
white_before = not LETTER[1+tline[p-1]]
end if
if (p + len > length(tline)) or not LETTER[1+tline[p+len]] then
white_after = TRUE
else
white_after = FALSE
end if
if white_before and white_after then
count += 10 -- full credit: complete word match
elsif len > 3 then -- short words must match complete words
if white_before or white_after then
count += 7 -- prefix or suffix
else
count += 5 -- middle of a substring
end if
end if
tline = tline[p+len..$]
end while
counts[j] += count
end for
line = gets(f)
end for
return counts
end function
function sig_match(sequence word0, sequence signature)
-- return TRUE if word matches the message signature
integer index, p, bit
for j = 1 to length(word0) do
-- check if bit is ON
index = word0[j]
p = floor(index/8)+1
bit = power(2, and_bits(index, 7))
if and_bits(signature[p], bit) = 0 then
return FALSE
end if
end for
return TRUE
end function
integer master
master = open("../mlist/master", "rb")
if master = -1 then
errMessage("Couldn't open master file")
end if
constant WORD_CACHE_SIZE = 1000 -- retain this many word data files
procedure trim_cache()
-- remove surplus word data files
integer oldest, n
sequence word_files
sequence oldest_date
word_files = dir("words")
n = length(word_files)
while n > WORD_CACHE_SIZE + 2 do
-- find minimum date
oldest = 0
oldest_date = {9999}
for i = 1 to length(word_files) do
if match("word_", word_files[i][D_NAME]) then
if compare(oldest_date, word_files[i][D_YEAR..D_SECOND]) > 0 then
oldest = i
oldest_date = word_files[i][D_YEAR..D_SECOND]
end if
end if
end for
if oldest then
system("rm \"words/" & word_files[oldest][D_NAME] & '"', 2)
word_files[oldest][D_NAME] = ""
n -= 1
else
exit
end if
end while
end procedure
constant NUM_MESSAGES = 80000 -- at least this many in database
procedure search()
-- top level search
integer month, year, prev_year, offset, c, run, j
boolean skip, look_for
integer fn, m, firstm, lastm, line_count, chars_per_line, len, message_num
integer a, b, index
sequence lc, off, res, word
sequence from, signature
sequence keywords0, possible, s
keywords = wild:lower(keywords) -- general lower
keywords0 = repeat({}, length(keywords))
for i = 1 to length(keywords) do
word = keywords[i]
for k = 1 to length(word)-1 do
if word[k] >= 'a' and word[k] <= 'z' and
word[k+1] >= 'a' and word[k+1] <= 'z' then
-- a 2-letter combo - record the index
a = remainder(word[k], SIG_SIZE)
b = remainder(word[k+1], SIG_SIZE)
index = a * SIG_SIZE + b
keywords0[i] = append(keywords0[i], index)
end if
end for
end for
postedBy = lower(postedBy)
firstm = fromYear*12+fromMonth
lastm = toYear*12+toMonth
off = repeat(0, 4)
lc = repeat(0, 3)
if first_res = 1 then
html_puts("Searching ...\n")
end if
prev_year = -1
-- load up word score data files from previous searches
dat = repeat(0, length(keywords))
for i = 1 to length(dat) do
if thread then
fn = -1 -- can't use saved scores
log_msg("THREAD")
else
fn = open("words/word_" & keywords[i], "rb")
end if
if fn != -1 then
log_msg("cache HIT: " & keywords[i])
s = get_bytes(fn, 4)
if length(s) = 4 then
len = bytes_to_int(s)
dat[i] = repeat(UNKNOWN, len)
j = 1
while j <= len do
c = getc(fn)
if c = EOF then
log_msg(sprintf(" - incomplete#1: %d/%d", {j, len}))
exit -- truncated
end if
if c = 0 or c = UNKNOWN then
run = getc(fn)
if run = EOF then
log_msg(sprintf(" - incomplete#2: %d/%d", {j, len}))
exit
end if
if c = 0 then
dat[i][j..j+run-1] = 0
end if
j += run
else
dat[i][j] = c
j += 1
end if
end while
else
-- corrupt file
dat[i] = repeat(UNKNOWN, NUM_MESSAGES)
end if
close(fn)
else
if not thread then
log_msg("cache MISS: " & keywords[i])
end if
dat[i] = repeat(UNKNOWN, NUM_MESSAGES)
end if
end for
message_num = 0
while 1 do
c = getc(master)
if c = EOF then
exit
end if
if c = TAG_MONTH then
-- month header
month = getc(master)
year = getc(master) + 1900
c = getc(master)
if c = EOF then
errMessage("unexpected EOF")
end if
m = year*12+month
if m >= firstm and m <= lastm then
skip = FALSE
if first_res = 1 and year != prev_year then
html_printf("%d ...\n", year)
flush(1)
prev_year = year
end if
else
skip = TRUE
end if
end if
if c != TAG_FROM then
errMessage("expected TAG_FROM")
end if
from = gets(master)
c = getc(master)
if c != TAG_SIGNATURE then
errMessage("expected TAG_SIGNATURE")
end if
signature = get_bytes(master, SIG_LEN)
c = getc(master)
if c != TAG_OFFSET then
errMessage("expected TAG_OFFSET")
end if
-- read offset
off[1] = getc(master)
off[2] = getc(master)
off[3] = getc(master)
off[4] = getc(master)
-- read line count
lc[1] = getc(master)
lc[2] = getc(master)
lc[3] = getc(master)
chars_per_line = getc(master)
message_num += 1
if not skip then
-- date is ok
totalCount += 1 -- number of messages in date range
if length(postedBy) = 0 or match(postedBy, from) then
-- poster is ok
possible = repeat(0, length(keywords))
look_for = FALSE
for i = 1 to length(keywords) do
if message_num > length(dat[i]) then
dat[i] &= repeat(UNKNOWN, 500 + message_num - length(dat[i]))
end if
if dat[i][message_num] = UNKNOWN then
if sig_match(keywords0[i], signature) then
-- must scan for this word in the message
possible[i] = keywords[i]
look_for = TRUE
else
-- no scan, score is 0
dat[i][message_num] = 0
end if
end if
end for
offset = bytes_to_int(off)
line_count = lc[1] + 256 * lc[2] + 65536 * lc[3]
if look_for then
-- we have to scan this message for certain words
res = scan(possible, year, month, offset, line_count)
for i = 1 to length(keywords) do
if sequence(possible[i]) then
-- record what we learned
if res[i] >= UNKNOWN then
res[i] = UNKNOWN-1 -- score per word is max 254
end if
dat[i][message_num] = res[i]
else
res[i] = dat[i][message_num]
end if
end for
else
-- no words to scan for
res = repeat(0, length(keywords))
for i = 1 to length(keywords) do
res[i] = dat[i][message_num]
end for
end if
AddHit(res, {year, month, offset, line_count}, chars_per_line)
end if
end if
end while
-- save word data information
save_words()
end procedure
procedure top_link(sequence percent, sequence name, sequence url, sequence w)
-- HTML for one top link
html_puts("<td bgcolor=\"#FFCC66\" width=\"" & percent &
"%\" align=center height=15 valign=bottom>\n")
html_puts("<font face=\"verdana, arial, geneva\" size=-2>\n")
html_puts("<a class=\"toplink\" href=\"" & url & "\"><b>" & name &
"</b></a></font></td>\n")
html_puts("<td bgcolor=\"#FFCC66\" width=\"1%\" valign=top align=right>\n")
html_puts("<img src=\"topcr.gif\" width=5 height=5></td>\n")
if length(w) > 0 then
html_puts("<td bgcolor=\"#FFFFFF\" width=" & w &
"><img src=\"dum.gif\" width=1 height=1></td>\n")
end if
end procedure
procedure top_links()
-- display the top links
html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
html_puts("<tr valign=top>\n")
top_link("7", "Home", "index.html", "1")
top_link("16", "What Is Euphoria?", "hotnew.htm", "1")
top_link("14", "Documentation", "manual.htm", "1")
top_link("7", "News", "news.htm", "1")
top_link("14", "EUforum", "listserv.htm", "1")
top_link("17", "Download Euphoria", "v20.htm", "1")
top_link("18", "Donate", "reg.htm", "")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td colspan=2 bgcolor=\"#FFCC66\" height=3>\n")
html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
for i = 1 to 6 do
html_puts("<td bgcolor=\"#FFFFFF\"><img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("<td colspan=2 bgcolor=\"#FFCC66\">\n")
html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
end for
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td colspan=20 bgcolor=\"#FFFFFF\" height=3>\n")
html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("</tr>\n")
html_puts("</table>\n")
html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
html_puts("<tr valign=top>\n")
top_link("23", "Recent User Contributions", "contrib.htm", "1")
top_link("12", "The Archive", "archive.htm", "1")
top_link("22", "Other Euphoria Web Sites", "othersit.htm", "1")
top_link("16", "RDS Development", "contract.htm", "1")
top_link("22", "Related Books & Software", "books.htm", "")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td colspan=2 bgcolor=\"#FFCC66\" height=3>\n")
for i = 1 to 4 do
html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("<td bgcolor=\"#FFFFFF\"><img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("<td colspan=2 bgcolor=\"#FFCC66\">\n")
end for
html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("</tr>\n")
html_puts("</table>\n")
end procedure
procedure htmlHeader1()
-- First batch of HTML
html_puts("Content-type: text/html\n\n")
html_puts("<html><head>\n")
html_puts("<title>search results for EUforum</title>\n")
html_puts("<base href=\"http://www.RapidEuphoria.com/\">\n")
html_puts("<link REL=StyleSheet HREF=\"global.css\" TYPE=\"text/css\" MEDIA=screen>\n")
html_puts("</head>\n")
html_puts("<body bgcolor=\"#FFFFFF\" link=\"#003366\"" &
" vlink=\"#006699\" text=\"#000000\">\n")
html_puts("<basefont size=3>\n\n")
top_links()
html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
html_puts("<tr valign=top>\n")
html_puts("<td width=\"90%\"></td>\n")
html_puts("<td></td>\n")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td colspan=2 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td colspan=2 bgcolor=\"#CCCC99\"><img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td colspan=2 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td align=center>\n")
html_puts("<font face=\"Arial, Helvetica\" color=\"#CC3366\" size=+2><b>\n")
html_puts("Search Results</b></font></td>\n")
html_puts("<td></td>\n")
html_puts("</tr>\n")
html_puts("</table>\n")
end procedure
constant month_codes = "123456789ABC"
constant months = {"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}
sequence keystring
procedure htmlHeader2()
-- second batch of HTML
sequence selected
html_puts("<p>\n")
html_puts("<form method=GET action=\"http://www.OpenEuphoria.org/cgi-bin/esearch.exu\">\n\n")
html_puts("<table border=1 cellpadding=0 cellspacing=0 width=\"100%\">\n")
html_puts("<tr bgcolor=\"#FFFFEE\">\n")
html_puts("<td>\n")
html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
html_puts("<tr>\n")
html_puts("<td width=\"5%\"></td>\n")
html_puts("<td width=\"17%\"></td>\n")
html_puts("<td width=\"17%\"></td>\n")
html_puts("<td width=\"22%\"></td>\n")
html_puts("<td width=\"28%\"></td>\n")
html_puts("<td width=\"11%\"></td>\n")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td colspan=6 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td></td>\n")
html_puts("<td colspan=3><font face=\"Arial, Helvetica\" size=3 color=\"#990033\">\n")
html_puts("<b>EUforum Search</b></font>\n")
html_puts("</td>\n")
html_puts("<td colspan=2 align=center>\n")
html_puts("<font face=\"Comic Sans MS\" size=-2><i>\n")
html_puts("<a href=\"esearch.txt\">Powered by Euphoria</a></i></font>\n")
html_puts("</td>\n")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td colspan=6 height=7><img src=\"dum.gif\" width=1 height=1></td></tr>\n")
html_puts("<tr>\n")
html_puts("<td></td>\n")
html_puts("<td><font face=\"Arial, Helvetica\" size=-2 color=\"#990033\">\n")
html_puts("Starting:<br>\n")
html_puts("<select name=\"fromMonth\">\n")
for i = 1 to length(month_codes) do
if fromMonth = i then
selected = "selected"
else
selected = ""
end if
html_printf("<option value=\"%s\" %s>%s\n",
{month_codes[i], selected, months[i]})
end for
html_puts("</select>\n")
html_puts("<select name=\"fromYear\">\n")
for i = 1996 to the_date[1] do
if fromYear = i then
selected = "selected"
else
selected = ""
end if
html_printf("<option value=\"%s\" %s>%d\n", {M_CODE[i-1995], selected, i})
end for
html_puts("</select>\n")
html_puts("</font></td>\n")
html_puts("<td><font face=\"Arial, Helvetica\" size=-2 color=\"#990033\">\n")
html_puts("Ending:<br>\n")
html_puts("<select name=\"toMonth\">\n")
for i = 1 to length(month_codes) do
if toMonth = i then
selected = "selected"
else
selected = ""
end if
html_printf("<option value=\"%s\" %s>%s\n",
{month_codes[i], selected, months[i]})
end for
html_puts("</select>\n")
html_puts("<select name=\"toYear\">\n")
for i = 1996 to the_date[1] + (toYear > the_date[1]) do
if toYear = i then
selected = "selected"
else
selected = ""
end if
html_printf("<option value=\"%s\" %s>%d\n", {M_CODE[i-1995], selected, i})
end for
html_puts("</select>\n")
html_puts("</font></td>\n")
html_puts("<td><font face=\"Arial, Helvetica\" size=-2 color=\"#990033\">\n")
html_puts("Posted by:<br>\n")
html_puts("<input type=\"text\" name=\"postedBy\" size=20 ")
html_printf("value=\"%s\"", {postedBy})
html_puts(">\n</font></td>\n")
puts(1,
"<td valign=top><font face=\"Arial, Helvetica\" size=-2 color=\"#990033\">\n")
html_puts("Keywords:<br>\n")
html_puts("<input type=\"text\" name=\"keywords\" size=25 ")
html_puts("value=\"")
for j = 1 to length(keystring) do
if keystring[j] = '"' then
html_puts(""")
else
html_puts(keystring[j])
end if
end for
html_puts("\"></font></td>\n")
html_puts("<td><input type=\"submit\" value=\"Search!\"></td>\n")
html_puts("</tr>\n")
html_puts("<tr>\n")
html_puts("<td colspan=6 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
html_puts("</tr>\n")
html_puts("</table>\n")
html_puts("</td>\n")
html_puts("</tr>\n")
html_puts("</table>\n\n")
html_puts("</form>\n\n")
html_puts("<p>\n")
end procedure
function from_line(sequence low_line)
-- Is this the FROM: line?
integer f, b
f = match("from:", low_line)
if f = 0 then
return FALSE
end if
b = find('>', low_line)
if b = 0 then
return TRUE
end if
return f <= 4 and f < b
end function
boolean made_bold
function insert_bold(sequence line, sequence keyword, integer small)
-- mark all occurrences of keyword with <b>...</b>
-- note: after inserting <b>...</b> we'll have a glitch if
-- a subsequent keyword happens to be 'b'
integer p, p1
integer len
boolean good, white_before, white_after
sequence low_line
low_line = lower(line)
len = length(keyword)
p = 1
while 1 do
p1 = match(keyword, low_line[p..length(low_line)])
if p1 = 0 then
exit
end if
p += p1 - 1
if len <= small then
if p = 1 or not LETTER[1+low_line[p-1]] then
white_before = TRUE
else
white_before = FALSE
end if
if p+len > length(low_line) or not LETTER[1+low_line[p+len]] then
white_after = TRUE
else
white_after = FALSE
end if
if white_before and white_after then
good = TRUE
else
good = FALSE
p += len
end if
else
good = TRUE
end if
if good then
made_bold = TRUE
line = line[1..p-1] & "<b>" & line[p..p+len-1] & "</b>" &
line[p+len..length(line)]
low_line = lower(line)
p += 3+len+4
end if
end while
return line
end function
boolean found_subject
sequence subject
procedure subject_underline(sequence line, integer year, integer month)
-- add the href to the subject line
integer start
integer m1, m2, y1, y2
if line[length(line)] = '\n' then
line = line[1..length(line)-1]
end if
start = find(':', line)+1 -- must be there
while start <= length(line) do
if not find(line[start], " \t\n\r") then
exit
end if
start += 1
end while
html_puts(line[1..start-1])
html_puts("<a href=http://www.OpenEuphoria.org/cgi-bin/esearch.exu?")
y1 = year
m1 = month - 1
if m1 = 0 then
m1 = 12
y1 -= 1
end if
y2 = year
m2 = month+1
if m2 = 13 then
m2 = 1
y2 += 1
end if
html_printf("thread=1&fromMonth=%s&fromYear=%s&toMonth=%s&toYear=%s",
{M_CODE[m1], M_CODE[y1-1995], M_CODE[m2], M_CODE[y2-1995]})
html_puts("&keywords=\"" & subject & "\">")
if thread then
html_puts("<b>")
end if
html_puts(line[start..length(line)])
if thread then
html_puts("</b>")
end if
html_puts("</a>\n")
end procedure
function blank_plus(sequence s)
-- replace blanks by plus signs
-- and various other characters by hex codes
sequence new_s
new_s = ""
for i = 1 to length(s) do
if find(s[i], " \t\r") then
new_s &= '+'
elsif s[i] = '?' then
new_s &= "%3F"
elsif s[i] = '&' then
new_s &= "%26"
elsif s[i] = '%' then
new_s &= "%25"
elsif s[i] = '+' then
new_s &= "%2B"
elsif s[i] = '#' then
new_s &= "%23"
else
new_s &= s[i]
end if
end for
return new_s
end function
function delete_re(sequence s)
-- remove all "Re:" etc
integer r
sequence sl
sl = lower(s)
while TRUE do
r = match("re:", lower(s))
if r then
s = s[1..r-1] & s[r+3..length(s)]
else
exit
end if
end while
return s
end function
function set_subject(sequence line)
-- set the subject words, minus any "Re:" etc.
integer substart
if not found_subject then
substart = match("subject:", lower(line))
if substart = 1 then
found_subject = TRUE
subject = line[substart+8..length(line)]
subject = delete_re(subject)
subject = delete_leading_white(delete_trailing_white(subject))
subject = blank_plus(subject)
return TRUE
else
return FALSE
end if
end if
return FALSE -- never reached???
end function
boolean found_from
procedure show_bold(sequence text, integer year, integer month)
-- display one line with keywords in bold
sequence low_line
sequence line
boolean sub
sub = set_subject(text)
line = ""
for i = 1 to length(text) do
-- replace < and >
if text[i] = '<' then
line &= "<"
elsif text[i] = '>' then
line &= ">"
else
line &= text[i]
end if
end for
low_line = lower(line)
if not thread then
if length(postedBy) and not found_from and from_line(low_line) then
line = insert_bold(line, postedBy, 0)
found_from = TRUE
else
for k = 1 to length(keywords) do
line = insert_bold(line, keywords[k], 3)
end for
end if
end if
if sub then
subject_underline(line, year, month)
else
html_puts(line)
end if
end procedure
procedure printTabHead()
-- HTML for table of messages
html_puts("<table border=0 width=655 cellpadding=0 cellspacing=0>\n")
html_puts("<tr>\n")
html_puts("<td width=450 nowrap></td>\n")
html_puts("<td width=150 nowrap></td>\n")
html_puts("</tr>\n")
end procedure
procedure printResult()
-- output the entries that match
integer count, t, tabSize, numLeft, numIteration, e, m
integer prev_t, chop, fn, year, month, offset, line_count, fscore
atom score
boolean no_hit_in_msg
object line, euforum_num
sequence s, match_type
if length(top_hits) = 0 then
errMessage("No match found. Try again.")
html_puts("\n</body></html>\n")
return
end if
no_hit_in_msg = FALSE
html_puts("<p><font face=\"verdana, arial, geneva\" size=-1 color=\"#333333\">")
if thread then
if nhits = 1 then
s = ""
else
s = "s"
end if
html_printf("%d message%s in thread", {nhits,s})
else
if length(keywords) > 1 then
html_puts("exactly ")
end if
html_printf("matched %d", nhits-npartial)
if npartial > 0 then
html_printf(", partially matched %d", npartial)
end if
html_printf(" of %d messages in date range", totalCount)
end if
tabSize = OUT_CHUNK_SIZE -- number of entries in one <tab> table for speed
if nhits - first_res + 1 < max_per_page then
numLeft = nhits - first_res + 1
else
numLeft = max_per_page
end if
if nhits > max_per_page then
html_printf(" - displaying results %d to %d",
{first_res, first_res + numLeft - 1})
end if
html_puts("</font>\n<p>\n")
t = first_res
count = 0
while numLeft > 0 do
if numLeft <= tabSize then
numIteration = numLeft
numLeft = 0
else
numIteration = tabSize
numLeft -= tabSize
end if
-- do a set in one table so browser can start printing it
printTabHead()
for k = 1 to numIteration do
count += 1
found_from = FALSE
found_subject = FALSE
score = top_hits[t][SCORE]
year = top_hits[t][LOCATION][1]
month = top_hits[t][LOCATION][2]
fn = open_month(year, month)
if fn <= 0 then
return
end if
offset = top_hits[t][LOCATION][3]
if seek(fn, offset) != 0 then
errMessage(sprintf("seek to %d failed in %d/%d",
{offset, year, month}))
end if
line_count = top_hits[t][LOCATION][4]
line = gets(fn)
euforum_num = {}
m = match("X-EUFORUM: ", line)
if m then
if year*12 + month >=
the_date[1]*12 + the_date[2] - 3 then
-- fairly recent, might be in EUforum
line = line[m+11..length(line)]
euforum_num = value(line)
if euforum_num[1] = GET_SUCCESS then
euforum_num = euforum_num[2]
e = open(sprintf("../EUforum/m%d.html", euforum_num), "r")
if e != -1 then
close(e)
else
euforum_num = {} -- message not available anymore
end if
end if
end if
line = gets(fn) -- skip to real first line
end if
fscore = floor(score/BIG_VALUE)
if fscore = length(keywords) then
match_type = "exact match"
else
match_type = "partial match"
end if
if atom(euforum_num) then
html_printf("<tr bgcolor=\"#40CC40\"><td>" &
"<font face=ARIAL size=2 color=#FFFFFF> <b># %d", t)
if length(keywords) > 1 then
html_printf(" %s %d/%d",
{match_type, fscore, length(keywords)})
end if
html_printf("</b></font></td>\n<td align=right><font face=ARIAL size=1 " &
"color=#FFFFFF>view this message in " &
"<a class=\"euforum\" href=http://www.OpenEuphoria.org/EUforum/m%d.html>" &
"EUforum</a> </font></td></tr>\n", euforum_num)
else
html_printf("<tr bgcolor=\"#40CC40\"><td colspan=2>" &
"<font face=ARIAL size=2 color=#FFFFFF> <b># %d", t)
if length(keywords) > 1 then
html_printf(" %s %d/%d",
{match_type, fscore, length(keywords)})
end if
html_puts("</b></font></td></tr>\n")
end if
if remainder(count, 2) = 1 then
html_puts("<tr><td colspan=2 bgcolor=\"#FFCCFF\"><pre>")
else
html_puts("<tr><td colspan=2 bgcolor=\"#CCCCFF\"><pre>")
end if
made_bold = FALSE
for i = 1 to line_count do
if atom(line) then
exit
end if
while length(line) > LINE_WIDTH do
chop = LINE_WIDTH
for j = 1 to 7 do
if find(line[chop], " \t\n\r,.)]") then
exit
end if
chop -= 1
end for
if chop = LINE_WIDTH-7 then
chop = LINE_WIDTH
end if
show_bold(line[1..chop], year, month)
html_puts('\n')
line = line[chop+1..length(line)]
end while
show_bold(line, year, month)
line = gets(fn)
end for
if not made_bold then
no_hit_in_msg = TRUE -- something is wrong
end if
html_puts("</pre></td></tr>\n")
prev_t = t
t += 1
end for
html_puts("\n</table>\n")
end while
html_puts("<p> <br>\n")
html_puts("<font face=\"verdana, arial, geneva\" size=2 color=\"#333333\">")
count = nhits - first_res + 1
if nhits > max_per_page then
-- more than one page of results
chop = match("&first_res=", query)
if chop then
-- remove old one
while TRUE do
query = query[1..chop-1] & query[chop+1..length(query)]
if chop > length(query) or query[chop] = '&' then
exit
end if
end while
end if
count -= max_per_page
html_puts("<center>")
if first_res > max_per_page then
html_puts("<= <a href=http://www.OpenEuphoria.org/cgi-bin/esearch.exu?" & query)
html_printf("&first_res=%d>Previous %d",
{first_res - max_per_page, max_per_page})
html_puts(" Results</a>\n")
end if
if count > 0 then
-- more results coming after this page
html_puts(" <a href=http://www.OpenEuphoria.org/cgi-bin/esearch.exu?" & query)
if count > max_per_page then
html_printf("&first_res=%d>Next %d",
{first_res + max_per_page, max_per_page})
else
html_printf("&first_res=%d\">Final %d",
{first_res + max_per_page, count})
end if
html_puts(" Results</a> =>")
end if
html_puts("</center></font>\n")
else
html_puts("<center>End of Search Results</center></font>\n")
end if
html_puts("\n<p> </body></html>\n")
if no_hit_in_msg and not thread and length(keywords) then
-- probably a word-wrap issue, could be on any page of results
-- viewed by the user
-- system("echo \"" & query & "\" > ex.err", 2)
-- system(
-- "mail -s \"esearch.exu no_hit_in_msg!\" rds@RapidEuphoria.com < ex.err > /dev/null",
-- 2)
end if
end procedure
function hex_char(integer c)
-- is c a valid hex character?
return find(c, "0123456789ABCDEFabcdef")
end function
function hex_val(integer c)
-- return value of a hex character
if c >= 'A' and c <= 'F' then
return 10 + c - 'A'
elsif c >= 'a' and c <= 'f' then
return 10 + c - 'a'
else
return c - '0'
end if
end function
function parse_input(sequence s)
-- crack the syntax sent from Web browser: aaa=bbb&ccc=ddd&...
-- Convert to {{"aaa", "bbb"}, {"ccc", "ddd"}, ...} left-right pairs
integer i, c
sequence word_pairs, left_word, right_word
word_pairs = {}
i = 1
s &= {0,0,0} -- end markers
while s[i] != 0 do
left_word = ""
while 1 do
-- build left word
c = s[i]
if c = '=' or c = '&' or c = 0 then
exit
end if
if c = '%' and hex_char(s[i+1]) and hex_char(s[i+2]) then
c = 16 * hex_val(s[i+1]) + hex_val(s[i+2])
i += 2
elsif c = '+' then
c = ' '
end if
left_word &= c
i += 1
end while
i += 1
right_word = ""
while 1 do
-- build right word
c = s[i]
if c = '&' or c = 0 then
exit
end if
if c = '%' and hex_char(s[i+1]) and hex_char(s[i+2]) then
c = 16 * hex_val(s[i+1]) + hex_val(s[i+2])
i += 2
elsif c = '+' then
c = ' '
end if
right_word &= c
i += 1
end while
i += 1
word_pairs = append(word_pairs, {left_word, right_word})
end while
return word_pairs
end function
function getKeywords()
-- get values from the CGI query string, e.g.
-- fromMonth=6&fromYear=1&toMonth=3&toYear=7&postedBy=rds&keywords=apple+orange
sequence key, pairs, var, val
boolean inquote
query = getenv("QUERY_STRING")
if atom(query) then
query = getenv("query_string")
if atom(query) then
errMessage("Internal Error - no query_string")
end if
end if
log_msg(sprintf("%d-%d-%d %d:%02d\n%s",
append(the_date[1..5], query)))
pairs = parse_input(query)
-- defaults in case of corrupted query string
fromMonth = 6
fromYear = 1996
toMonth = the_date[2]
toYear = the_date[1]
keystring = ""
postedBy = ""
first_res = 1
for i = 1 to length(pairs) do
var = lower(pairs[i][1])
val = pairs[i][2]
if equal(var, "frommonth") and length(val) then
if val[1] >= 'A' then
fromMonth = val[1] - 'A' + 10
else
fromMonth = val[1] - '0'
end if
elsif equal(var, "fromyear") and length(val) then
if val[1] >= 'A' then
fromYear = 1995 + val[1] - 'A' + 10
else
fromYear = 1995 + val[1] - '0'
end if
elsif equal(var, "tomonth") and length(val) then
if val[1] >= 'A' then
toMonth = val[1] - 'A' + 10
else
toMonth = val[1] - '0'
end if
elsif equal(var, "toyear") and length(val) then
if val[1] >= 'A' then
toYear = 1995 + val[1] - 'A' + 10
else
toYear = 1995 + val[1] - '0'
end if
elsif equal(var, "postedby") then
postedBy = val
while length(postedBy) > 0 and
(postedBy[1] = ' ' or postedBy[1] = '\t') do
postedBy = postedBy[2..length(postedBy)]
end while
elsif equal(var, "thread") then
thread = TRUE
elsif equal(var, "keywords") then
keystring = val
elsif equal(var, "first_res") then
first_res = value(val)
if first_res[1] = GET_SUCCESS then
first_res = first_res[2]
max_hits += first_res-1
else
first_res = 1
end if
end if
end for
if fromYear = 1996 and fromMonth < 6 then
fromMonth = 6
end if
keywords = {}
if fromYear * 100 +fromMonth > toYear*100 + toMonth then
return
"Starting month/year must be earlier or equal to ending month/year"
end if
-- make list of keywords from keystring
key = ""
if thread then
-- just one string to search for (and it might contain double-quotes)
if length(keystring) and keystring[1]='"' then
keystring = keystring[2..$]
end if
if length(keystring) and keystring[$] = '"' then
keystring = keystring[1..$-1]
end if
if length(keystring) then
keywords = {keystring}
else
keywords = {}
end if
else
keystring &= ' '
inquote = FALSE
for i = 1 to length(keystring) do
if keystring[i] = '"' then
inquote = not inquote
elsif not inquote and keystring[i] = ' ' then
if length(key) then
keywords = append(keywords, key)
key = ""
end if
elsif not find(keystring[i], "`'&") then
key = append(key, keystring[i])
end if
end for
keystring = keystring[1..length(keystring)-1]
end if
return ""
end function
sequence msg
htmlHeader1()
msg = getKeywords()
htmlHeader2()
if length(msg) > 0 then
errMessage(msg)
end if
flush(1)
search()
log_msg(sprintf("search completed, time: %.2f", time()-t0))
printResult()
flush(1)
stats()
if not thread and remainder(the_date[5], 10) = 0 then
trim_cache()
end if