#!/usr/bin/env ruby
### extlogkeywords --- Utility for getting search keywords from web log files.
## Copyright 2008 by Dave Pearson
## $Revision: 1.7 $
##
## extlogkeywords is free software distributed under the terms of the GNU
## General Public Licence, version 2. For details see the file COPYING.
# Quick, dirty and very simplistic weblog search term analysis tool. Needs a
# fair bit of work to be more generally useful (for example, it handles my
# logs, it might not handle yours).
# Stuff we require.
require 'cgi'
############################################################################
# Class for reading an apache log line.
class ApacheLogLine
  # Pattern for a "combined" style access log entry. Captures, in order:
  # client host, timestamp, request line, status, response size, referrer
  # and (optionally) the user agent.
  #
  # The host is matched as any run of non-blank characters so that IPv6
  # addresses and resolved hostnames are accepted as well as dotted IPv4
  # addresses. The trailing user-agent field is optional so that lines
  # that stop after the referrer still parse.
  LOG_PATTERN = /^(\S+) .*?\[(.*?)\] "(.*?)" (\d+) (.*?) "(.*?)"(?: "(.*?)")?/

  # The raw, unmodified line that was handed to the constructor.
  attr_reader :line
  # Client address (first field of the line); nil if the line didn't parse.
  attr_reader :ip
  # Text found between the square brackets; nil if the line didn't parse.
  attr_reader :time
  # Text of the first quoted field (the request); nil if the line didn't parse.
  attr_reader :request
  # Status code, as a string of digits; nil if the line didn't parse.
  attr_reader :status
  # Field following the status, kept as a string; nil if the line didn't parse.
  attr_reader :size
  # Text of the second quoted field (the referrer); nil if the line didn't parse.
  attr_reader :referrer
  # Text of the third quoted field (the user agent); nil if it is absent or
  # the line didn't parse.
  attr_reader :browser

  ##########################################################################
  # Constructor.
  #
  # line - a single line of text from an access log.
  #
  # Every attribute other than #line is left as nil when the line does not
  # look like a log entry.
  def initialize( line )
    # Save the line.
    @line = line
    @ip = @time = @request = @status = @size = @referrer = @browser = nil

    # Logs routinely contain bytes that are not valid in the current
    # encoding, and matching a regular expression against such a string
    # raises. Match against a repaired copy; @line keeps the raw text.
    # (String#scrub does not exist on very old rubies, hence the check.)
    text = line
    if line.respond_to?( :scrub ) && !line.valid_encoding?
      text = line.scrub
    end

    # Try and pull the useful bits out of it.
    match = LOG_PATTERN.match( text )
    return unless match

    @ip, @time, @request, @status, @size, @referrer, @browser = match.captures
  end
end
############################################################################
# Class for getting information about a search engine query.
class QueryString
  # Finds a "q" URL parameter and captures its (still encoded) value. The
  # value runs up to the next "&" or double quote, or to the end of the
  # text when "q" is the last parameter.
  QUERY_PATTERN = /[?&]q=([^&"]*)/

  # Really common words that are dropped by #words.
  STOP_WORDS = [ "a", "of", "the", "to", "or", "in", "is", "and", "for",
                 "+", "on", "at", "!", "-" ].freeze

  # Things that will generally be Google directives; dropped by #words.
  DIRECTIVE_PATTERN = /^-[a-z]+?:/

  # The decoded, lower-cased query text, or nil when no query was found.
  attr_accessor :string

  ##########################################################################
  # Constructor.
  #
  # line - any object that responds to #referrer with a String or nil.
  #
  # When the referrer carries a "q" parameter its value is URL-decoded,
  # lower-cased, stripped of surrounding whitespace and has any leading
  # "cache:... " prefix removed; the result is stored in #string.
  def initialize( line )
    @string = nil
    @words  = nil

    referrer = line.referrer
    return if referrer.nil?

    search = QUERY_PATTERN.match( repair( referrer ) )
    return unless search

    # Percent-decoding can produce bytes that are not valid in the string's
    # encoding (queries sent in a legacy charset, for example), so repair
    # the decoded text before doing any further string work on it.
    decoded = repair( CGI::unescape( search[ 1 ] ) )
    @string = decoded.downcase.strip.gsub( /^cache:.+? /, "" )
  end

  ##########################################################################
  # Is there a query in this line?
  def query?
    !@string.nil?
  end

  ##########################################################################
  # Get the words from the line.
  #
  # Returns the unique words of the query, in order of first appearance,
  # with commas, double quotes and backslashes removed, "+" treated as a
  # separator, and stop words and directives filtered out. Returns an
  # empty array when there is no query. The list is computed once and
  # then reused.
  def words
    return [] unless query?

    @words ||= @string.gsub( /[,\"\\]/, "" ).gsub( /\+/, " " ).split.reject do |word|
      STOP_WORDS.include?( word ) || DIRECTIVE_PATTERN.match( word )
    end.uniq
  end

  private

  ##########################################################################
  # Return +text+ with invalid byte sequences replaced, where the running
  # ruby is able to do that; otherwise return it untouched.
  def repair( text )
    text.respond_to?( :scrub ) ? text.scrub : text
  end
end
############################################################################
# Base report class.
class Report
  # Minimum count an entry needs before a report will show it. This base
  # report does not consult it itself.
  attr_accessor :min

  ##########################################################################
  # Constructor. Starts with no minimum, i.e. a minimum count of zero.
  def initialize
    @min = 0
  end

  ##########################################################################
  # Consume a query found in a log: write its text to standard output
  # straight away, one query per line.
  def consume( query )
    $stdout.puts( query.string )
  end

  ##########################################################################
  # Emit the report. Everything was already written by #consume, so there
  # is nothing left to do here.
  def emit
    nil
  end
end
############################################################################
# Like the base report but with a cleaned-up query.
class ReportWords < Report
  ##########################################################################
  # Consume a query found in a log: write its cleaned-up word list to
  # standard output as a single line, words separated by one space.
  def consume( query )
    cleaned = query.words.join( " " )
    puts cleaned
  end
end
############################################################################
# Report that produces a sorted table of phrases.
class QueryTableReport < Report
  ##########################################################################
  # Constructor. Sets up the tally of query text => times seen, where a
  # query that has not been seen yet counts as zero.
  def initialize
    super
    @phrases = Hash.new( 0 )
  end

  ##########################################################################
  # Consume a query found in a log: bump the tally for its text.
  def consume( query )
    @phrases[ query.string ] += 1
  end

  ##########################################################################
  # Emit the report: one line per query, highest count first. The query
  # is padded out to 60 columns and the count is right-aligned in the 10
  # columns after it. Queries seen fewer than +min+ times are left out.
  def emit
    ranked = @phrases.sort {|left, right| right.last <=> left.last }
    ranked.each do |text, hits|
      next if hits < @min
      puts text.ljust( 60 ) + hits.to_s.rjust( 10 )
    end
  end
end
############################################################################
# Report that produces a sorted table of cleaned phrases.
class CleanQueryTableReport < Report
  ##########################################################################
  # Constructor. The tally maps a cleaned phrase to the number of times it
  # has turned up; phrases not yet seen read as zero.
  def initialize
    super
    @phrases = Hash.new( 0 )
  end

  ##########################################################################
  # Consume a query found in a log. The query's word list is joined back
  # into a single space-separated phrase, and that phrase is what gets
  # counted.
  def consume( query )
    cleaned = query.words.join( " " )
    @phrases[ cleaned ] += 1
  end

  ##########################################################################
  # Emit the report: phrases in descending order of count, the phrase in a
  # 60 column wide field followed by the count right-aligned in 10
  # columns. Phrases counted fewer than +min+ times are skipped.
  def emit
    descending = @phrases.sort {|one, other| other[ 1 ] <=> one[ 1 ] }
    descending.each do |entry|
      cleaned, total = entry
      unless total < @min
        puts "#{cleaned.ljust( 60 )}#{total.to_s.rjust( 10 )}"
      end
    end
  end
end
############################################################################
# Report that produces a sorted table of words.
class WordTableReport < Report
  ##########################################################################
  # Constructor. Prepares the word => occurrences tally; unseen words
  # read as zero.
  def initialize
    super
    @words = Hash.new( 0 )
  end

  ##########################################################################
  # Consume a query found in a log: every word in the query's word list
  # has its tally increased by one.
  def consume( query )
    query.words.each do |term|
      @words[ term ] += 1
    end
  end

  ##########################################################################
  # Emit the report: a line per word, most frequent first, with the word
  # left-aligned in 60 columns and its count right-aligned in 10. Words
  # counted fewer than +min+ times are not shown.
  def emit
    @words.sort {|x, y| y.last <=> x.last }.each do |term, seen|
      next unless seen >= @min
      puts term.ljust( 60 ) + seen.to_s.rjust( 10 )
    end
  end
end
############################################################################
# Report that produces a "word graph"
class GraphReport < Report
  ############################################################################
  # A word together with a counter that starts at zero.
  class Word
    # The word itself, and how many times #inc! has been called.
    attr_reader :word, :count

    ##########################################################################
    # Constructor.
    def initialize( word )
      @word  = word
      @count = 0
    end

    ##########################################################################
    # Add one to the count. Returns the new count.
    def inc!
      @count += 1
    end
  end

  ############################################################################
  # A counted word that also keeps a counted entry for every word it has
  # been seen alongside.
  class RelatedWords < Word
    # Hash of related word => Word; entries are created on first access.
    attr_reader :related

    ##########################################################################
    # Constructor.
    def initialize( word )
      super
      @related = Hash.new do |table, name|
        table[ name ] = Word.new( name )
      end
    end

    ##########################################################################
    # Record one more sighting of +word+ alongside this word.
    def <<( word )
      @related[ word ].inc!
    end
  end

  ############################################################################
  # Constructor. The graph maps a word to its RelatedWords entry, which is
  # created the first time the word is looked up.
  def initialize
    super
    @graph = Hash.new do |table, name|
      table[ name ] = RelatedWords.new( name )
    end
  end

  ##########################################################################
  # Consume a query found in a log. Each word in the query has its own
  # count increased and is linked to every other word of the same query.
  def consume( query )
    query.words.each do |word|
      node = @graph[ word ]
      node.inc!
      ( query.words - [ word ] ).each do |other|
        node << other
      end
    end
  end

  ##########################################################################
  # Emit the report. Words are listed in descending order of count; under
  # each one, tab-indented, come its related words in descending order of
  # how often they appeared with it.
  def emit
    ranked = @graph.sort {|a, b| b[ 1 ].count <=> a[ 1 ].count }
    ranked.each do |word, node|
      puts "#{word} (#{node.count})"
      neighbours = node.related.sort {|a, b| b[ 1 ].count <=> a[ 1 ].count }
      neighbours.each do |other, link|
        puts "\t#{other} (#{link.count})"
      end
    end
  end
end
############################################################################
# Report that produces a "word graph" in a form that can be used with
# the GraphViz tools.
class DotGraph < GraphReport
  # When true-ish, words that have no related words are left out.
  attr_accessor :linked_only
  # Format string for a per-node URL; the word is substituted into it with
  # String#%. nil or an empty string means "no URL".
  attr_accessor :url
  # When true-ish, each node's label also shows the word's count.
  attr_accessor :with_counts

  ##########################################################################
  # Emit the report as an undirected GraphViz graph on standard output.
  #
  # A node is written for every word whose count is at least +min+ (and,
  # with +linked_only+, that has at least one related word). Its fill
  # colour gets bluer the higher its count is. An edge is written once per
  # pair of words that appeared together at least +min+ times.
  def emit
    # Figure out the colour step. With nothing consumed there is no
    # largest count, so guard the division rather than dividing by zero.
    highest     = @graph.values.map {|node| node.count }.max || 0
    colour_step = highest.zero? ? 0 : 128 / highest

    # Start of data.
    puts "graph {"

    # For each word in the main list....
    @graph.each do |word, node|
      # Only interested if its count is at least the required min...
      next if node.count < @min
      # ...and, if we're ignoring non-linked words, it has some links.
      next if @linked_only && node.related.empty?

      # Sort out the label.
      label = @with_counts ? "label=\"#{word}\\n(#{node.count})\" " : ""

      # Sort out the URL. An empty string counts as "not set": ruby treats
      # "" as true, so a plain truth test would give every node an empty
      # URL attribute.
      if @url.to_s.empty?
        url = ""
      else
        url = " URL=\"#{@url % word}\" target=\"_blank\""
      end

      # Configure the node.
      shade = ( 127 + ( node.count * colour_step ) ).to_s( 16 )
      puts "\t\"#{word}\" [#{label}style=\"filled\" fillcolor=\"#0000#{shade}\" fontcolor=\"white\"#{url}];"

      # For each related word...
      node.related.each do |related, link|
        # Emit the connection. Each pair is recorded from both ends, so
        # only write it from the end that sorts higher to avoid duplicates.
        if word > related && link.count >= @min
          puts "\t\"#{word}\" -- \"#{related}\";"
        end
      end
    end

    # End of data.
    puts "}"
  end
end
############################################################################
# Main utility code.
if $0 == __FILE__

  # We're going to use long options.
  require "getoptlong"

  # Set the default parameters. Keys are the long option names with the
  # leading "--" removed and "-" turned into "_". The URL defaults to nil
  # rather than "" because an empty string is true in ruby and would be
  # taken as "a URL was given".
  $params = {
    :dot_linked_only => false,
    :dot_url         => nil,
    :dot_with_counts => false,
    :emit            => "querytable",
    :help            => false,
    :min             => 0
  }

  # Print the help screen.
  def printHelp
    print "extlogkeywords v#{/(\d+\.\d+)/.match( '$Revision: 1.7 $' )[ 1 ]}
Copyright 2008 by Dave Pearson
http://www.davep.org/
Supported command line options:
-e --emit Specify the type of report to emit.
See \"Supported reports\" below.
-m --min Specify the minimum count to include in
a report (where appropriate).
Command line options related to dot file output:
--dot-with-counts Display counts in nodes.
--dot-url Associate URL with nodes.
This is a format string. Use %s where you
want the node name to appear in the URL.
--dot-linked-only Only include words that are related to
other words.
Supported reports:
Name Short Description
========== ===== ===============================================
queries q Simply lists every query found.
words w Simply lists all words found.
querytable qt Sorted table of queries found, with counts.
cleanquerytable cqt As above, but cleaned up a little.
wordtable wt Sorted table of words found, with counts.
graph g Graph data for all words found.
dot d Output that can be used with GraphViz.
"
  end

  # Get the arguments from the command line. Each option that was given
  # overwrites the matching default; a bad command line gets the help
  # screen and a failure exit status.
  begin
    GetoptLong.new.set_options(
      [ "--emit",            "-e", GetoptLong::REQUIRED_ARGUMENT ],
      [ "--help",            "-h", GetoptLong::NO_ARGUMENT       ],
      [ "--dot-with-counts",       GetoptLong::NO_ARGUMENT       ],
      [ "--dot-url",               GetoptLong::REQUIRED_ARGUMENT ],
      [ "--dot-linked-only",       GetoptLong::NO_ARGUMENT       ],
      [ "--min",             "-m", GetoptLong::REQUIRED_ARGUMENT ]
    ).each do |name, value|
      $params[ name.gsub( /^--/, "" ).gsub( /-/, "_" ).intern ] = value
    end
  rescue GetoptLong::Error
    printHelp()
    exit 1
  end

  # User wants help?
  if $params[ :help ]
    printHelp()
    exit 0
  end

  # Create the reporting object.
  report = case $params[ :emit ]
           when "queries", "q"          then Report.new
           when "words", "w"            then ReportWords.new
           when "querytable", "qt"      then QueryTableReport.new
           when "cleanquerytable", "cqt" then CleanQueryTableReport.new
           when "wordtable", "wt"       then WordTableReport.new
           when "graph", "g"            then GraphReport.new
           when "dot", "d"
             dot = DotGraph.new
             dot.with_counts = $params[ :dot_with_counts ]
             dot.linked_only = $params[ :dot_linked_only ]
             # Hand over nil rather than an empty URL so that "no URL"
             # can't be mistaken for a URL.
             dot_url = $params[ :dot_url ]
             dot.url = dot_url.to_s.empty? ? nil : dot_url
             dot
           else
             # A mistyped report name is a user error, not a program
             # fault: say so plainly and fail, without a backtrace.
             $stderr.puts "Unknown report type '#{$params[ :emit ]}'"
             exit 1
           end

  # Set the min count.
  report.min = $params[ :min ].to_i

  # Consume every line on stdin, passing on only those lines whose
  # referrer holds a search query.
  $stdin.each do |line|
    search = QueryString.new( ApacheLogLine.new( line ) )
    report.consume( search ) if search.query?
  end

  # Emit the report.
  report.emit

end
### extlogkeywords ends here