<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:georss='http://www.georss.org/georss' xmlns:gd='http://schemas.google.com/g/2005' xmlns:thr='http://purl.org/syndication/thread/1.0'><id>tag:blogger.com,1999:blog-10029294</id><updated>2012-01-04T10:11:31.622-08:00</updated><title type='text'>Conventional Spam Filter</title><subtitle type='html'>Just a Conventional Spam Filter for studying spam-related problems and trying to solve some using the defined methods.</subtitle><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/posts/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default?max-results=100'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/'/><link rel='hub' href='http://pubsubhubbub.appspot.com/'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>15</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>100</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-10029294.post-115695449714033363</id><published>2006-08-30T09:14:00.000-07:00</published><updated>2006-08-30T09:14:57.146-07:00</updated><title type='text'></title><content type='html'>&lt;A HREF='http://photos1.blogger.com/blogger/445/152/640/Vaish_Shiva_Marriage%20169.jpg'&gt;&lt;IMG SRC='http://photos1.blogger.com/blogger/445/152/320/Vaish_Shiva_Marriage%20169.jpg' border=0 alt='' 
style='cursor:hand'&gt;&lt;/A&gt;&amp;nbsp;&amp;nbsp;&lt;a href='http://picasa.google.com/blogger/' target='ext'&gt;&lt;img src='http://photos1.blogger.com/pbp.gif' alt='Posted by Picasa' style='border: 0px none ; padding: 0px; background: transparent none repeat scroll 0% 50%; -moz-background-clip: initial; -moz-background-origin: initial; -moz-background-inline-policy: initial;' align='middle' border='0' /&gt;&lt;/a&gt; &lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-115695449714033363?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/115695449714033363/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=115695449714033363' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/115695449714033363'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/115695449714033363'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2006/08/blog-post.html' title=''/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-111143247572536281</id><published>2005-03-21T11:14:00.000-08:00</published><updated>2005-03-21T11:14:35.726-08:00</updated><title type='text'>A Statistical Approach to the Spam Problem | Linux Journal</title><content type='html'>&lt;a href="http://www.linuxjournal.com/article/6467"&gt;A Statistical Approach to the Spam Problem | Linux Journal&lt;/a&gt;: "This article 
discusses one of many possible mathematical foundations for a key aspect of spam&lt;br /&gt;filtering--generating an indicator of ``spamminess'' from a collection of tokens representing the content of&lt;br /&gt;an e-mail."&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-111143247572536281?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/111143247572536281/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=111143247572536281' title='5 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/111143247572536281'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/111143247572536281'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/03/statistical-approach-to-spam-problem.html' title='A Statistical Approach to the Spam Problem | Linux Journal'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>5</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110859468466760687</id><published>2005-02-16T14:58:00.000-08:00</published><updated>2005-02-16T14:58:04.666-08:00</updated><title type='text'>jgc's spam and anti-spam newsletter #7  </title><content type='html'>SESSION 1&lt;br /&gt;&lt;br /&gt;  9:00  Bill Yerazunis       Unified Model of Spam Filtration    (0.17)&lt;br /&gt;  9:20  Eugene Koontz        Bayesian Phishing Classification   (18.37)&lt;br /&gt;  9:40* Jonathan Zdziarski   Bayesian Noise Reduction           (39.02)&lt;br 
/&gt;10:00* Jonathan Oliver      Lexicographical Distancing         (58.05)&lt;br /&gt;&lt;br /&gt;http://web.mit.edu/webcast/spamconf05/spam-conference-21jan05-morning1-8&lt;br /&gt;0k.ram&lt;br /&gt;http://web.mit.edu/webcast/spamconf05/spam-conference-21jan05-morning1-2&lt;br /&gt;20k.ram&lt;br /&gt;&lt;br /&gt;SESSION 2&lt;br /&gt;&lt;br /&gt;10:40* Richard Segal et al  Classifier Aggregation              (0.20)&lt;br /&gt;11:00* Jim Fenton           Message vs. User Authentication    (19.50)&lt;br /&gt;11:20  Rui Dai et al        Regulation                         (39.50)&lt;br /&gt;11:40* Oscar Boykin         Personal Email Network Structure (1:00.15)&lt;br /&gt;&lt;br /&gt;http://web.mit.edu/webcast/spamconf05/spam-conference-21jan05-morning2-8&lt;br /&gt;0k.ram&lt;br /&gt;http://web.mit.edu/webcast/spamconf05/spam-conference-21jan05-morning2-2&lt;br /&gt;20k.ram&lt;br /&gt;&lt;br /&gt;SESSION 3&lt;br /&gt;&lt;br /&gt;13:40  Brian McWilliams     Spam Kings                         (0.15)&lt;br /&gt;14:00  John Graham-Cumming  People and Spam                   (19.45)&lt;br /&gt;14:20* Constance Bommelaer  French Government and Spam        (39.05)&lt;br /&gt;14:40* Matthew Prince       Project Honeypot                (1:00.15)&lt;br /&gt;15:00* Jon Praed            Jeremy Jaynes Spam Trial        (1:19.40)&lt;br /&gt;&lt;br /&gt;http://web.mit.edu/webcast/spamconf05/spam-conference-21jan05-afternoon1&lt;br /&gt;-80k.ram&lt;br /&gt;http://web.mit.edu/webcast/spamconf05/spam-conference-21jan05-afternoon1&lt;br /&gt;-220k.ram&lt;br /&gt;&lt;br /&gt;SESSION 4&lt;br /&gt;&lt;br /&gt;15:40  Gordon Cormack       Standardized Filter Evaluation   (0.20)&lt;br /&gt;16:00* Dave Mazieres        Mail Avenger                    (25.20)&lt;br /&gt;&lt;br /&gt;http://web.mit.edu/webcast/spamconf05/spam-conference-21jan05-afternoon2&lt;br /&gt;-80k.ram&lt;br /&gt;http://web.mit.edu/webcast/spamconf05/spam-conference-21jan05-afternoon2&lt;br /&gt;-220k.ram&lt;br /&gt; &lt;br 
/&gt;- &lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110859468466760687?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110859468466760687/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110859468466760687' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110859468466760687'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110859468466760687'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/02/jgcs-spam-and-anti-spam-newsletter-7.html' title='jgc&apos;s spam and anti-spam newsletter #7  '/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110746261583254450</id><published>2005-02-03T12:30:00.000-08:00</published><updated>2005-02-03T12:30:15.833-08:00</updated><title type='text'>ConSpam Power Point Presentation</title><content type='html'>&lt;a href="http://www.geocities.com/from_india_withlove/ConSpam.ppt"&gt;Conventional Spam Filter Presentation&lt;/a&gt; by Vaishnavi.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110746261583254450?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110746261583254450/comments/default' title='Post Comments'/><link rel='replies' 
type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110746261583254450' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110746261583254450'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110746261583254450'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/02/conspam-power-point-presentation.html' title='ConSpam Power Point Presentation'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110744385263841627</id><published>2005-02-03T07:17:00.000-08:00</published><updated>2005-02-03T07:17:32.636-08:00</updated><title type='text'>review questions</title><content type='html'>Today i had my project review . 
Some of the questions&lt;br /&gt;that was asked by the guide are:&lt;br /&gt;&lt;br /&gt;1) What are the attributes do u filter the mail.&lt;br /&gt;              List out the what are all the things u&lt;br /&gt;check in order to consider to derive the conclusion of&lt;br /&gt;spam and legitimate message.&lt;br /&gt;&lt;br /&gt;2) If mail is the literal translation of the tamil&lt;br /&gt;words written in english then what do the filter do.&lt;br /&gt;Whether it will consider it as spam or it will pass&lt;br /&gt;through the filter.&lt;br /&gt;&lt;br /&gt;Ans : ( I guess)&lt;br /&gt;   &lt;br /&gt;   If the mail is written in english though the words &lt;br /&gt;represent  tamil words( ie literal translation of the&lt;br /&gt;tamil sentences in english) then each token will be&lt;br /&gt;given the probability of 0.4 (as the tokens are all&lt;br /&gt;new) according to bayesian filter concept.  Thus the&lt;br /&gt;mail will be considered as non spam mail.&lt;br /&gt;&lt;br /&gt;3) As there is tamil sentence written in english.&lt;br /&gt;There are chances that users may use different&lt;br /&gt;spelling to represent the same word. Then what is the&lt;br /&gt;case?&lt;br /&gt;&lt;br /&gt;Ans : ( I guess)&lt;br /&gt;&lt;br /&gt;There are possibility of only a few words to be&lt;br /&gt;misspelled or spelled differently. At this case, while&lt;br /&gt;calculating the combined probability there are chances&lt;br /&gt;of leaving those words and thus due to calculation of&lt;br /&gt;combined probability the mail will pass through the&lt;br /&gt;filter.&lt;br /&gt;&lt;br /&gt;4) bayesian filter is better than  what other filters.&lt;br /&gt;Say with reason.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;-- &lt;br /&gt;O.R.Vaishnavi Devi &amp;lt;orvaish@hotpop.com&amp;gt;&lt;br /&gt;Thiagarajar College&lt;br /&gt;&lt;br /&gt;________________________________________________________________________&lt;br /&gt;Yahoo! 
India Matrimony: Find your life partner online&lt;br /&gt;Go to: http://yahoo.shaadi.com/india-matrimony&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110744385263841627?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110744385263841627/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110744385263841627' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110744385263841627'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110744385263841627'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/02/review-questions.html' title='review questions'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110735334708036891</id><published>2005-02-02T06:09:00.000-08:00</published><updated>2005-02-02T06:09:07.080-08:00</updated><title type='text'>Worst Kind of SPAM</title><content type='html'>&lt;a href="http://polls.slashdot.org/pollBooth.pl?qid=1232&amp;aid=-1"&gt;/. 
Poll&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110735334708036891?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110735334708036891/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110735334708036891' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110735334708036891'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110735334708036891'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/02/worst-kind-of-spam.html' title='Worst Kind of SPAM'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110727841136912873</id><published>2005-02-01T09:20:00.000-08:00</published><updated>2005-02-03T07:23:59.156-08:00</updated><title type='text'>Whitelist</title><content type='html'>&lt;span style="font-family: verdana;font-size:100%;" &gt;Whitelist:&lt;br /&gt;    Whitelist is similar to address book where in we keep the details about the known senders.&lt;br /&gt;Having whitelist in the filter has an advantage that we can accept the mails from the senders in the whitelist without any filtering there by saving computation.&lt;br /&gt;But the problem is that one sender can have different email address. 
If the known sender has send the mail with his new address and the mail contains words that may increase the probabilty of being considered as spam then there are chances for false positives when passed through the filter.&lt;br /&gt;&lt;br /&gt;The whilelist associated with the bayesian solves this problem that as the entire header is checked the senders route, protocol used, the ip address are noted. Thus though the sender uses the different address for the mail the other fields may indicate that the message is from the known person and the mail will be accepted without filtering.&lt;/span&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110727841136912873?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110727841136912873/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110727841136912873' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110727841136912873'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110727841136912873'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/02/whitelist.html' title='Whitelist'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110727839362522113</id><published>2005-02-01T09:19:00.000-08:00</published><updated>2005-02-03T07:24:38.243-08:00</updated><title type='text'>Bayesian Filter</title><content 
type='html'>&lt;span style="font-family: verdana;font-size:100%;" &gt;Explanation of Bayesian Filter&lt;br /&gt;Article : A plan for Spam&lt;br /&gt;&lt;br /&gt;There are 2 corpus one(corpus1) which is used to store the legitimate mails ie the mails that are deleted by the user ordinarily goes to this corpus. Another one(corpus2) for storing spam messages that are obtained when the user deletes his received mail using the option of "delete as spam".&lt;br /&gt;&lt;br /&gt;Now the messages are scanned entirely in the corpus1(good corpus) and separated as tokens. Here the whole message including the header, Java script, html  are scanned.  Then each token is taken and the number of time that token occur in the whole corpus is counted and stored in a hash table(good). Thus the hash table contains the mapping between the token and the number of times it occurs in the corpus.&lt;br /&gt;The same procedure is repeated with the corpus2(spam corpus) to get another hash table(bad) which also contains the mapping between the tokens in the corpus2 and their number of occurances.&lt;br /&gt;&lt;br /&gt;A third has table is created mapping the token with the probabilty that the mail containing it is a spam.&lt;br /&gt;The fomula used for calculating the probability is as follows.&lt;br /&gt;&lt;br /&gt;(let((g(*2(or gethash word good) 0)))&lt;br /&gt;    (b(or (gethash word bad) 0)))&lt;br /&gt;(unless (&amp;gt; ( + g b) 5)&lt;br /&gt;(max 0.1 (min .99 (float (/ (min 1(/ b nbad)) ( + (min 1 (/ g ngood)) ( min 1 ( / b nbad)))))))&lt;br /&gt;&lt;br /&gt;Explanation:&lt;br /&gt;    Here word is the token for which the probability is to be calculated. Gethash means getting the number of occurance of the word from the hash tables ( good or bad). ngood and nbad is the total number of messages in the corpus1 and corpus2 respectively.&lt;br /&gt;&lt;br /&gt;The code here is a LISP code. In Lisp the expression (a+b) is written as (+ a b).
&lt;br /&gt;&lt;br /&gt;To understand the code we need to know first about the probability. Suppose there are 2 red,3 white balls. Then the probability of white balls is 3/5 ie number of white balls divided by total number of balls present.&lt;br /&gt;And also another point to be noted is the probability always lies between 0 and 1.&lt;br /&gt;&lt;br /&gt;Here in our code in order to reduce/avoid false positive we actually do two things&lt;br /&gt;1) double the good words.&lt;br /&gt;2) then while calculating the probability we consider the divisor to be total number of messages in the corpus rather than total number of words in the corpos which is the actual thing we have to consider by the definition of probability.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Thus the variable g = 2 * (the number of  occurance of the word in the good hash table if present or 0).&lt;br /&gt;&lt;br /&gt;And b = the number of  occurance of the word in the good hash table if present or 0.&lt;br /&gt;&lt;br /&gt;One  point to be considered is that the probability of the new word(ie the token that is not present in any of the corpus) is considered as 0.4.&lt;br /&gt;&lt;br /&gt;We consider only those words that have occured more than 5 times when both the corpuses are taken together.  This is checked using the line of code (&amp;gt; (+ g b) 5) which is similar to g+b&amp;gt;5. If the condition satisfies then the following steps are performed.&lt;br /&gt;&lt;br /&gt;1)Then calculate g/ngood and compare it with 1 and get the minimum of both.&lt;br /&gt;2)Calculate b/nbad and compare it with 1 and get the minimum of both.&lt;br /&gt;&lt;br /&gt;The comparsion with 1 is done in order to have a maximum of only 1 ie some times it so happens that the value of the ratio might be &amp;gt; 1 but the probabilty can be from 0 to 1 thus the values &amp;gt; 1 are rounded of to 1 to satisfy the  probability condition.&lt;br /&gt;&lt;br /&gt;3) Add the results of step 1 nad step 2.
&lt;br /&gt;4) divide the result of step 2 (the capped b/nbad) by the result of step 3.&lt;br /&gt;5) Adjust the value so that it should be between 0.1 and 0.99.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;For a better explanation consider the word zzzz that surely indicates the msg containing it is spam. The word has occurred 20 times and there are 10 msg in the corpus2. The b/nbad becomes 2, which step 2 caps at 1. And suppose g/ngood is around 0.1 (suppose) then applying step 3 will result in value like 1.10. Step 4 will then be 1/1.10 which will surely be around 1 , in our case it is 0.9091. Now the min(0.99,0.9091) is taken. here it is 0.9091. And then max(0.9091,0.1) is taken. The result is 0.9091 which becomes the probability of the word  zzzz.&lt;br /&gt;&lt;br /&gt;After calculating the probability of each token from the message to be tested as spam, the most interesting 15 tokens are taken based on how far they are from the neutral 0.5.(I don't know what they mean by this ).&lt;br /&gt;&lt;br /&gt;Then the combined probability is calculated using the formula(This formula seems to be difficult to understand):&lt;br /&gt;(let ((prod (apply #'* probs)))&lt;br /&gt;(/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x)) probs))))).&lt;br /&gt;&lt;br /&gt;The mail is considered as spam only if this combined probability results in a value greater than 0.9.
&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;/span&gt;&lt;span style="font-size:100%;"&gt;&lt;br /&gt; &lt;/span&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110727839362522113?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110727839362522113/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110727839362522113' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110727839362522113'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110727839362522113'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/02/bayesian-filter.html' title='Bayesian Filter'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110710903128753492</id><published>2005-01-30T10:17:00.001-08:00</published><updated>2005-02-03T07:50:51.846-08:00</updated><title type='text'>Some Useful Terms</title><content type='html'>&lt;span style="font-size:100%;"&gt;Spyware :&lt;br /&gt;      Programs that causes your computer to display ads even when you are not using the program in question for its intended purpose.&lt;br /&gt;Spyware hijacks computers,secretly changing their settings, bauanges them with pop up ads, and installs adware and other software program that may cause computer to malfunction, slow down and even crash.
&lt;br /&gt;&lt;br /&gt;Phishing Attacks:&lt;br /&gt;&lt;br /&gt;The fraudulent solicitation for account information such as credit card numbers and passwords by impersonating the domain and email content of a company.&lt;/span&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110710903128753492?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110710903128753492/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110710903128753492' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110710903128753492'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110710903128753492'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/01/some-useful-terms.html' title='Some Useful Terms'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110710906752900218</id><published>2005-01-30T10:17:00.000-08:00</published><updated>2005-02-03T07:25:12.630-08:00</updated><title type='text'>SPIT</title><content type='html'>&lt;span style="font-family: verdana;font-size:100%;" &gt;&lt;b&gt;&lt;u&gt;SPIT&lt;br /&gt;&lt;/u&gt;&lt;/b&gt;&lt;br /&gt;Audio Spam over Internet Telephony. The users voiceboxes could become clogged with the unsolicitated Advertising messages.
&lt;br /&gt;&lt;br /&gt;The Solution is a filter that identifies calls likes to be spam, based on the frequency and duration of the calls and then removes them.&lt;/span&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110710906752900218?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110710906752900218/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110710906752900218' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110710906752900218'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110710906752900218'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/01/spit.html' title='SPIT'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110707294314075358</id><published>2005-01-30T00:15:00.000-08:00</published><updated>2005-01-30T00:15:43.140-08:00</updated><title type='text'>For Spammers</title><content type='html'>&lt;h1&gt;&lt;strong&gt;orvaish@hotpop.com&lt;/strong&gt;&lt;/h1&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="mailto:orvaishnavi@hotpop.com"&gt;&lt;h1&gt;&lt;strong&gt;orvaish@hotpop.com&lt;/strong&gt;&lt;/h1&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Ready to combat the ConSpam filter??!
&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110707294314075358?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110707294314075358/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110707294314075358' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110707294314075358'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110707294314075358'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/01/for-spammers.html' title='For Spammers'/><author><name>rays</name><uri>http://www.blogger.com/profile/15466949790773743608</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110680552942865917</id><published>2005-01-26T21:58:00.000-08:00</published><updated>2005-01-26T21:58:49.426-08:00</updated><title type='text'>Microsoft Predicts Spam will be solved by 2006</title><content type='html'> Microsoft chairman, Bill Gates, predicted that the digital world would be spam-free in 2006. Gates made this prognostication at the January 2004 conference in Switzerland.&lt;br /&gt;&lt;br /&gt;However, with just about a year towards a spam-free web experience, a lot of work has yet to be done. 
Today, 60 percent of all emails sent worldwide is still spam, according to a salon.com article entitled “How Microsoft is losing the war of spam.”&lt;br /&gt;&lt;br /&gt;Ironically, according to the article, written by Brian McWilliams, experts are pointing their fingers at Microsoft as the root of the spam problem. It can also be the key to solving it, it was added.&lt;br /&gt;&lt;br /&gt;Mr. McWilliams wrote, “Most junk email today emanates from Windows computers that spammers have hijacked and turned into spam ‘zombies’ using security holes in Microsoft's operating system. What's more, Microsoft is blamed for wrecking efforts this past summer to create email authentication standards. The company also stands accused of trying to neuter state anti-spam laws. And Microsoft has yet to win a lawsuit against a major spammer.”&lt;br /&gt;&lt;br /&gt;Although, the spam problem should be addressed through a concerted effort by the government, the industry and the users themselves, experts still believe that Microsoft is in a position to totally eradicate spam, if only it had the will to do more.&lt;br /&gt;&lt;br /&gt;But, why isn’t Microsoft doing more?&lt;br /&gt;&lt;br /&gt;John Levine, chairman of the Anti-Spam Research Group, says Microsoft is acting as if software piracy were a much bigger problem than protecting users against spam and viruses. He recommended, “Microsoft should give away security upgrades to unauthorized users of Windows, even if doing so undercuts the firm's campaign against software piracy.”&lt;br /&gt;&lt;br /&gt;Microsoft’s other crimes, according to Salon, are its lack of participation in the creation of email standards, and its attorneys’ inability to win a single legal battle against major spammers. 
&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110680552942865917?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110680552942865917/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110680552942865917' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110680552942865917'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110680552942865917'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/01/microsoft-predicts-spam-will-be-solved.html' title='Microsoft Predicts Spam will be solved by 2006'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110520217776624101</id><published>2005-01-08T08:36:00.001-08:00</published><updated>2005-01-08T08:36:17.766-08:00</updated><title type='text'>r we not making our project big!</title><content type='html'> &lt;DIV&gt;&lt;FONT face=Verdana color=#0000ff size=2&gt;&lt;/FONT&gt;&amp;nbsp;&lt;/DIV&gt; &lt;BLOCKQUOTE style="MARGIN-RIGHT: 0px"&gt;   &lt;DIV&gt;&lt;/DIV&gt;   &lt;DIV class=OutlookMessageHeader lang=en-us dir=ltr align=left&gt;&lt;FONT    face=Tahoma size=2&gt;-----Original Message-----&lt;BR&gt;&lt;B&gt;From:&lt;/B&gt; Vaishnavi    Devi&amp;nbsp;&lt;FONT face=Verdana&gt;&lt;FONT color=#0000ff&gt;&lt;SPAN    class=026412014-08012005&gt;&amp;nbsp; &amp;nbsp;&lt;/SPAN&gt;&lt;SPAN    
class=026412014-08012005&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&lt;/SPAN&gt;&lt;/FONT&gt;&lt;/FONT&gt;&lt;BR&gt;&lt;BR&gt;&lt;/FONT&gt;&lt;/DIV&gt;   &lt;P&gt; &lt;BR&gt;hi dha,&lt;BR&gt;&amp;nbsp; Our idea seems to extend the project making it a    great and very big and there creates a doubt whether it can be finished within    the limited time. &lt;BR&gt;I discussed with the staff, he says to take 4 systems    in the lab, 2 system will be considered as the valid ip system and 2 other the    system will be considered invalid. ie if have to accept mails only from valid    systems and not from invalid. This sounds similar to sender id i think so.    Then he says while designing phase study of all the existing ideas while    implementing do this idea and if we finish this within time we can enhance    further.&lt;BR&gt;His point should be considered coz as humans we will be using    at least 3 to 4hr completely for project. And i have to finish this within    feb.&lt;BR&gt;Now i am very much confused. Rather i am afraid whether i can be able    to finish this (the reason can be said in phone and i am browsing from    college) I think u would have understood.&lt;BR&gt;dilse,&lt;BR&gt;vaish.    
&lt;/P&gt;bye,&lt;BR&gt;~Vaishnavi Devi~&lt;BR&gt;&lt;SPAN class=026412014-08012005&gt;&lt;FONT    face=Verdana color=#0000ff size=2&gt;&amp;nbsp;  &amp;nbsp;&lt;/FONT&gt;&lt;/SPAN&gt;&lt;/BLOCKQUOTE&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110520217776624101?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110520217776624101/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110520217776624101' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110520217776624101'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110520217776624101'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/01/r-we-not-making-our-project-big.html' title='r we not making our project big!'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110520216887794560</id><published>2005-01-08T08:36:00.000-08:00</published><updated>2005-01-08T08:36:08.876-08:00</updated><title type='text'>r we making our filter stricter</title><content type='html'> &lt;DIV&gt;&lt;FONT face=Verdana color=#0000ff size=2&gt;&lt;SPAN  class=183411914-08012005&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;/FONT&gt;&lt;/DIV&gt; &lt;BLOCKQUOTE style="MARGIN-RIGHT: 0px"&gt;   &lt;DIV&gt;&lt;/DIV&gt;   &lt;DIV class=OutlookMessageHeader lang=en-us dir=ltr align=left&gt;&lt;FONT    face=Tahoma size=2&gt;-----Original 
Message-----&lt;BR&gt;&lt;B&gt;From:&lt;/B&gt; Vaishnavi    Devi&amp;nbsp;&lt;FONT face=Verdana&gt;&lt;FONT color=#0000ff&gt;&lt;SPAN    class=183411914-08012005&gt;&amp;nbsp; &amp;nbsp;&lt;/SPAN&gt;&lt;SPAN    class=183411914-08012005&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&lt;/SPAN&gt;&lt;/FONT&gt;&lt;/FONT&gt;&lt;BR&gt;&lt;BR&gt;&lt;/FONT&gt;&lt;/DIV&gt;   &lt;P&gt;&amp;nbsp; hi dha,&lt;BR&gt;&amp;nbsp; &amp;nbsp; I also started by reading the article "A    plan for spam". I did not understand some of the concepts like having the hash    tables and then not checking the emails that have entry in the whitelist. They    have included the problem with the whitelist ie when the old friend send msg    with new or other id there is probability of considering it as spam.&lt;BR&gt;&amp;nbsp;    &amp;nbsp; can u plz explain the article with example so as to make it    clear.&lt;BR&gt;&lt;BR&gt;Then i did not understand the spam with neural    networks.&lt;BR&gt;&lt;BR&gt;Our idea of 3 level.&lt;BR&gt;Does it works and r v making our    filter stricter.&lt;BR&gt;Coz in pauls article it is said that if we make our    filters stricter there is a chance for more false positives.&lt;BR&gt;&lt;BR&gt;In review    section how should i present&lt;BR&gt;What the modules we r going to do?&lt;BR&gt;Why r we    going for C language?&lt;BR&gt;&lt;BR&gt;Do reply to this!&lt;BR&gt;&lt;BR&gt;dilse,&lt;BR&gt;vaish&lt;BR&gt;&lt;SPAN    class=183411914-08012005&gt;&lt;FONT face=Verdana color=#0000ff size=2&gt;&amp;nbsp;    &amp;nbsp;&lt;/FONT&gt;&lt;/SPAN&gt; &lt;/P&gt;&lt;/BLOCKQUOTE&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110520216887794560?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110520216887794560/comments/default' title='Post Comments'/><link rel='replies' type='text/html' 
href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110520216887794560' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110520216887794560'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110520216887794560'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/01/r-we-making-our-filter-stricter.html' title='r we making our filter stricter'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-10029294.post-110519378349164932</id><published>2005-01-08T06:13:00.000-08:00</published><updated>2005-01-08T06:16:23.490-08:00</updated><title type='text'>Hi World</title><content type='html'>Let us show the &lt;span style="font-weight: bold;"&gt;Way To Hell &lt;/span&gt;to &lt;span style="font-weight: bold;"&gt;SPAM&lt;/span&gt; and &lt;span style="font-weight: bold;"&gt;SPAMMERS&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/10029294-110519378349164932?l=conspam.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://conspam.blogspot.com/feeds/110519378349164932/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=10029294&amp;postID=110519378349164932' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/10029294/posts/default/110519378349164932'/><link rel='self' type='application/atom+xml' 
href='http://www.blogger.com/feeds/10029294/posts/default/110519378349164932'/><link rel='alternate' type='text/html' href='http://conspam.blogspot.com/2005/01/hi-world.html' title='Hi World'/><author><name>Senthil Kumaran</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='//lh3.googleusercontent.com/-xzrzQ5Ehqr4/AAAAAAAAAAI/AAAAAAAAAAA/4xHGk-b3vAk/s512-c/photo.jpg'/></author><thr:total>0</thr:total></entry></feed>
