#!/usr/bin/perl -w # by xkr47 / Jonas Berlin 24.01.2005 # for Zack Brown use strict; require LWP::UserAgent; use HTTP::Date qw(parse_date); use HTML::Entities; use URI::Escape; sub parse_date2 { my $str = $_[0]; my ($year, $month, $day, $hour, $min, $sec, $tz) = parse_date($str); my $epoch; # boy, am I lazy :) ($year, $month, $day, $epoch) = split(/\s/s,`date '+%Y %m %d %s' --date='$year-$month-$day $hour:$min:$sec $tz'`); return ($epoch, $year, $month, $day); } # this function reads a mail from stdin and extracts the message id sub extract_messagedata { my %data; while() { s/\r//s; if(/^Message-ID:.*?<([^>]+)>/i) { $data{"mid"} = $1; } elsif(/^Subject:\s*(.*)/i) { $data{"subject"} = $1; } elsif(/^From:\s*(.*)/i) { my $str = $1; $data{"from"} = $str; } elsif(/^Date:\s*(.*)/i) { ($data{"epoch"}, $data{"year"}, $data{"month"}, $data{"day"}) = parse_date2($1); } last if(/^$/); } return %data; } sub convert_google_wildcard { # google replaces part of email addresses and message ids with # "...", e.g. Christoph Hellwig # this function constructs a regular expression that can be used # to match the original email / message id. my $str = $_[0]; my $re = quotemeta($str); # basic conversion $re =~ s!\\\.\\\.\\\.!.*!g; # replace ... with .* return '^\s*'.$re.'\s*$'; # add whitespace tolerance } # not very optimized, doesn't matter.. sub day_before { my ($day, $month, $year) = @_; $day--; if($day < 1) { $day = 28; $month--; if($month < 1) { $month = 12; $year--; } } return ($day, $month, $year); } sub day_after { my ($day, $month, $year, $howmany) = @_; for(my $i=0; $i<$howmany; ++$i) { $day++; if($day > 28) { $day = 1; $month++; if($month > 12) { $month = 1; $year++; } } } return ($day, $month, $year); } my $ua = LWP::UserAgent->new; $ua->timeout(10); $ua->env_proxy; $ua->agent('Mozilla/5.0'); # without this, google says "403 forbidden" my %data = extract_messagedata(); unless(defined($data{"mid"})) { print "Sorry, couldn't find a message id in your message\n"; exit(1); } my ($daym1,$monthm1,$yearm1) = day_before($data{"day"}, $data{"month"}, $data{"year"}); my ($dayp1,$monthp1,$yearp1) = day_after($data{"day"}, $data{"month"}, $data{"year"}, 2); # advance 2 days to be safe my $searchurl = 'http://groups-beta.google.com/groups?'. 'hl=en'. '&ie=UTF-8'. '&safe=off'. '&scoring=d'. # sort by date '&num=20'. # scan maximum 20 hits '&as_usubject='.uri_escape('"'.$data{"subject"}.'"'). '&as_ugroup=*linux.kernel'. '&as_drrb=b'. '&as_mind='.$daym1. '&as_minm='.$monthm1. '&as_miny='.$yearm1. '&as_maxd='.$dayp1. '&as_maxm='.$monthp1. '&as_maxy='.$yearp1. ""; my $response = $ua->get($searchurl); die $response->status_line unless $response->is_success; my @hits = split(m!href=\"/group/!,$response->content); for(my $i=1; $i<=$#hits; ++$i) { $hits[$i] =~ s!\".*!!s; unless($hits[$i] =~ m!/browse_thread/!) { $hits[$i] = undef; next; } $hits[$i] = "http://groups-beta.google.com/group/".$hits[$i]; } my %hitmasks; foreach (@hits) { $hitmasks{$_} = 3 if(defined($_)); # 1 = check older, 2 = check newer } # collect "individual" urls here, mapping to differency in seconds from message we are looking for my %foundurls; # this maps "individual" urls to "original" urls my %originalurls; # go through all hits for(my $i=1; $i<=$#hits; ++$i) { my $hiturl = $hits[$i]; next unless(defined($hiturl)); # print "\033[32;1m:: '$hiturl'\033[m\n"; my $response = $ua->get($hiturl); die $response->status_line unless $response->is_success; my $cont = $response->content; if($hitmasks{$hiturl} & 1) { if($response->content =~ m!]+href=\"([^\">]+)\">(?:[^<]|<(?:[^/]|/[^a]))*Older!s) { my $url = "http://groups-beta.google.com".$1; # print "\033[33;1mOlder = '$url'\033[m\n"; unless($hitmasks{$url}) { push @hits, $url; $hitmasks{$url} = 1; } } } if($hitmasks{$hiturl} & 2) { if($response->content =~ m!]+href=\"([^\">]+)\">Newer« Older       # Messages 26 - # 50 of 70 #       # Newer » # go through all messages for each hit my @msgs = split(m!class=scripthide!,$response->content); for(my $m=1; $m<=$#msgs; ++$m) { my $chunk = $msgs[$m]; $chunk =~ s!.*!!s; # between headers and body is , this chops body off. # fetch from header next unless($chunk =~ m!From: ([^<]+)!); my $from = decode_entities($1); my $fromre = convert_google_wildcard($from); next unless($data{"from"} =~ $fromre); # fetch date header and calc differency next unless($chunk =~ m!Date: ([^<]+)!); my ($epoch) = parse_date2($1); my $diff = abs($epoch - $data{"epoch"}); # find the "individual" url next unless($chunk =~ m!]*href="([^>]+)"[^>]*>Individual!); my $individualurl = "http://groups-beta.google.com".$1; # find the "individual" url next unless($chunk =~ m!]*href="([^>]+)"[^>]*>Show orig!); my $originalurl = "http://groups-beta.google.com".$1; $foundurls{$individualurl} = $diff; $originalurls{$individualurl} = $originalurl; } } my @foundurls_sorted = sort { $foundurls{$a} <=> $foundurls{$b} } keys %foundurls; my @potentials_without_messageid; my $finalurl; foreach my $foundurl (@foundurls_sorted) { my $originalurl = $originalurls{$foundurl}; my $response = $ua->get($originalurl); die $response->status_line unless $response->is_success; # if original-message-id not found, collect into array of potentials unless($response->content =~ m!^(?:X-)?Original-Message-ID:\s*\<(.*?)\>!mi) { push @potentials_without_messageid, $foundurl; next; } my $omid = decode_entities($1); my $omidre = convert_google_wildcard($omid); next unless($data{"mid"} =~ $omidre); $finalurl = $foundurl; last; } if(defined($finalurl)) { # message id match print "$finalurl\n"; } elsif($#potentials_without_messageid >= 0) { # no message id match, display alternatives print "Failed to match message id. Here are some possible alternatives:\n"; my $i = 1; foreach my $foundurl (@potentials_without_messageid) { print $i.". ".$foundurl."\n"; last if($i++ == 10); # show at most 10 first alternatives } print "\nAnyway, here's the search url I used:\n".$searchurl."\n"; print "DEBUG:\n"; foreach (keys %data) { print " ".$_." = '".$data{$_}."'\n"; } } elsif($#foundurls_sorted >= 0) { print "Sorry, I found some thread(s), but I couldn't find any message there with\n". "a matching message id.\n\n". "Here is the search url I used, maybe it's broken?\n".$searchurl."\n"; print "DEBUG:\n"; foreach (keys %data) { print " ".$_." = '".$data{$_}."'\n"; } } else { print "Sorry, wasn't able to find an url :P\n"; print "DEBUG:\n"; foreach (keys %data) { print " ".$_." = '".$data{$_}."'\n"; } }