#!/usr/bin/perl use warnings; use strict; # The idea is to be able to change these later print "Word count weight? (The higher, the more "; print "you care about simple word matches) [1-10]\n"; my $wordCountWeight = ; print "Distance weight? (The higher, the more "; print "you care about how far apart the words are) [0-10]\n"; my $distanceWeight = ; print "Not-found modifier? (The higher, the more "; print "you care about when words aren't in the search order) [0-10]\n"; my $notFoundModifier = ; print "Trigram multiplier? (The higher, the more "; print "you care about words in the EXACT order of the words) [1-2]\n"; my $trigramMultiplier = ; print "Give me a filename\n"; my $filename = ; my $lineScore = 0; my $i = 0; my $lineNum = 0; my $matchFlag = 0; my %sortedHash; my $bestLine = ''; my $bestScore = -100; open (FIN,$filename); my @lines = ; close(FIN); my $numLoops = @ARGV; my $numPairs = $numLoops - 1; foreach my $line (@lines) { chomp($line); for($i=0;$i<$numLoops;$i++) { my $searchString = $ARGV[$i]; while ($line =~ /$searchString/ig) { $lineScore += $wordCountWeight; } } if ($lineScore == 0) { next; } else { print "********************************************\n"; } ###################### # Distance searching # ###################### if ($numPairs < 1) { print "No pairs -- skipping distance searching\n"; } for($i=0;$i<$numPairs;$i++) { my $w1 = $ARGV[$i]; my $w2 = $ARGV[$i+1]; my @splitLine = split(/ /,$line); for(my $k=0;$k<@splitLine;$k++) { if ($splitLine[$k] =~ /$w1/ig) { my $matchFlag = 0; print "found $w1 @ $k, continuing for $w2\n"; my $tokenNum = $k + 1; while (($matchFlag == 0) && (my $token = $splitLine[$tokenNum])) { if ($token =~ /$w2/ig) { print "!! second word found at $tokenNum -- "; print "score $lineScore -> "; $lineScore = $lineScore + (($distanceWeight/($tokenNum - $k)) * $distanceWeight); print "$lineScore\n"; $matchFlag = 1; } $tokenNum++; } if ($matchFlag == 0) { # match not found print " never found second word -- "; print "score $lineScore -> "; $lineScore -= $notFoundModifier; # change to variable later print "$lineScore\n"; } } } } #################### # Do trigrams here # #################### if ($numLoops < 2) { print "Less than 2 words -- skipping trigrams\n"; } my @splitLine = split(/ /,$line); my $argIndex = 0; while ($argIndex < (@ARGV-2)) { my $w1 = $ARGV[$argIndex]; #print "word 1: $w1\n"; my $w2 = $ARGV[$argIndex+1]; #print "word 2: $w2\n"; my $w3 = $ARGV[$argIndex+2]; #print "word 3: $w3\n"; for(my $j=0;$j<@splitLine;$j++) { if ($splitLine[$j] =~ /$w1/ig) { print " first sequence trigram found\n"; if ($splitLine[$j+1] =~ /$w2/ig) { print " second sequence trigram found -- score $lineScore -> "; $lineScore *= $trigramMultiplier; print "$lineScore\n"; if($splitLine[$j+2] =~ /$w3/ig) { print " third sequence trigram found -- score $lineScore -> "; $lineScore *= $trigramMultiplier; print "$lineScore\n"; } } } } $argIndex++; } print "--> \"$line\"\n"; print "### Final Score: $lineScore\n"; if ($lineScore > $bestScore) { $bestScore = $lineScore; $bestLine = $line; } $lineNum++; $lineScore = 0; } print "================================================================\n"; print "Best Match: \n"; print "Score: $bestScore\n"; print "'$bestLine'\n";