#!/bin/tcsh
#
# Retrain SpamAssassin's Bayes database and the CRM114 classifier from mails
# that were sorted by hand into the "good" and "spam" IMAP filter folders.
#
# For every mail in those folders the cached original (referenced by its
# X-Spam-CRM114-CacheID header) is linked into a per-classifier working
# directory, but only if that classifier got the mail wrong. The working
# directories are then handed to sa-learn and mailtrainer.crm and emptied
# again afterwards.
#
# NOTE(review): the script calls chown and su, so it presumably has to be
# run as root -- confirm for your installation.

# source folders (in IMAP spool)
set src_dir_good=/var/spool/imap/filter/good
set src_dir_spam=/var/spool/imap/filter/spam

# working folders
set train_dir_crm_good=/var/tmp/crm114_train_good
set train_dir_crm_spam=/var/tmp/crm114_train_spam
set train_dir_sa_good=/var/tmp/sa_train_good
set train_dir_sa_spam=/var/tmp/sa_train_spam

# username of spamassassin
set username=vscan

# CRM114 directory: mailtrainer.crm is run from here and the reaver_cache
# with the original mails is looked up below it
set crmdir=/var/amavis/.crm114

# check existence
foreach dir ($train_dir_crm_good $train_dir_crm_spam $train_dir_sa_good $train_dir_sa_spam)
    if ( ! -ed $dir ) mkdir $dir
end

# delete old files
# (only when there is something to delete, otherwise the unmatched glob just
# produces a "No match" complaint)
foreach dir ($train_dir_crm_good $train_dir_crm_spam $train_dir_sa_good $train_dir_sa_spam)
    if ( `ls ${dir} | wc -l` > 0 ) rm ${dir}/*
end

# for every file:
# - find cached original
# - check if training with SA/CRM is necessary
# - link into working dir (replace ln with cp if using separate filesystems)
# NB: test >3 because i'm using Cyrus imapd which has 3 index files
if ( `ls ${src_dir_spam} | wc -l` > 3 ) then
    foreach mailfile ( ${src_dir_spam}/*. )
        # cache id of the mail, with the leading "sfid-" stripped
        set tail=`grep '^X-Spam-CRM114-CacheID' $mailfile | awk '{print $2}' | sed -e 's/sfid-\(20.*\)/\1/' | tail -1`
        # "$tail" / "$file" are quoted on purpose: an empty value would
        # otherwise vanish from the expression and tcsh would stop with
        # "if: Expression Syntax"
        if ( "$tail" != "" ) then
            set file=`ls -1 ${crmdir}/reaver_cache/*/$tail | head -1`
            if ( "$file" != "" ) then
                # CRM114 did not call it SPAM -> train CRM114
                set crmcorrect=`grep -c '^X-Spam-CRM114-Status: SPAM' $mailfile`
                if ( $crmcorrect == 0 ) ln -f $file $train_dir_crm_spam
                # none of the matched BAYES_xx rules fired -> train SA
                set sacorrect=`grep -A 2 '^X-Spam-Status: ' $mailfile | grep -E -c 'BAYES_[689][059]'`
                if ( $sacorrect == 0 ) ln -f $file $train_dir_sa_spam
            endif
        endif
    end
endif

# here the check is inverted, because no X-Spam header also means classified
# as good (not necessarily by bayes and CRM but after the complete SA run)
if ( `ls ${src_dir_good} | wc -l` > 3 ) then
    foreach mailfile ( ${src_dir_good}/*. )
        set tail=`grep '^X-Spam-CRM114-CacheID' $mailfile | awk '{print $2}' | sed -e 's/sfid-\(20.*\)/\1/' | tail -1`
        # quoted for the same reason as in the spam loop
        if ( "$tail" != "" ) then
            set file=`ls -1 ${crmdir}/reaver_cache/*/$tail | head -1`
            if ( "$file" != "" ) then
                # CRM114 status starts with S/U followed by P/N -> train CRM114
                set crmNOTcorrect=`grep -c '^X-Spam-CRM114-Status: [SU][PN]' $mailfile`
                if ( $crmNOTcorrect != 0 ) ln -f $file $train_dir_crm_good
                # one of the matched BAYES_xx rules fired -> train SA
                set saNOTcorrect=`grep -A 2 '^X-Spam-Status: ' $mailfile | grep -E -c 'BAYES_[689][059]'`
                if ( $saNOTcorrect != 0 ) ln -f $file $train_dir_sa_good
            endif
        endif
    end
endif

# make sure there is no file in the good- and the spam-set
if ( `ls ${train_dir_crm_good} | wc -l` > 0 ) then
    foreach f ( $train_dir_crm_good/* )
        set g=`basename $f`
        if ( -e ${train_dir_crm_spam}/$g ) then
            echo "Same file in CRM spam-dir and good-dir: $g"
            echo "E-Mail info:"
            grep -E '^(Date|From|To): ' $f
            echo
            exit 1
        endif
    end
endif

if ( `ls ${train_dir_sa_good} | wc -l` > 0 ) then
    foreach f ( $train_dir_sa_good/* )
        set g=`basename $f`
        if ( -e ${train_dir_sa_spam}/$g ) then
            echo "Same file in SA spam-dir and good-dir: $g"
            echo "E-Mail info:"
            grep -E '^(Date|From|To): ' $f
            echo
            exit 1
        endif
    end
endif

# preparations ready
echo linked files.

# run as ${username}
chown -R ${username}:${username} $train_dir_crm_good $train_dir_crm_spam $train_dir_sa_good $train_dir_sa_spam

# sa-learn for SA-Bayes
echo run sa-learn...
su ${username} -c "sa-learn --local --showdots --spam $train_dir_sa_spam"
su ${username} -c "sa-learn --local --showdots --ham $train_dir_sa_good"

# CRM114 mailtrainer.crm
echo run mailtrainer.crm...
su ${username} -c "cd ${crmdir}; crm -u ${crmdir} mailtrainer.crm --spam=$train_dir_crm_spam/ --good=$train_dir_crm_good/ --repeat=4 --random --verbose --worst=2"

# empty the working folders again (same guard as at the start)
echo clean dirs...
foreach dir ($train_dir_crm_good $train_dir_crm_spam $train_dir_sa_good $train_dir_sa_spam)
    if ( `ls ${dir} | wc -l` > 0 ) rm ${dir}/*
end