#!/usr/bin/perl -COL

# This small script is (c) 2013 Volker Schatz.  It may be copied and/or
# modified under the same terms as Perl.

use strict;
use warnings;
use LWP::UserAgent;
use URI::Escape;
use XML::Parser;

my $nresults= 30;   # API allows <= 50
my @props= qw( snippet size );
# Properties (see https://wiki.archlinux.org/api.php):
#  size             - Adds the size of the page in bytes
#  wordcount        - Adds the word count of the page
#  timestamp        - Adds the timestamp of when the page was last edited
#  score            - Adds the score (if any) from the search engine
#  snippet          - Adds a parsed snippet of the page
#  titlesnippet     - Adds a parsed snippet of the page title
#  redirectsnippet  - Adds a parsed snippet of the redirect title
#  redirecttitle    - Adds the title of the matching redirect
#  sectionsnippet   - Adds a parsed snippet of the matching section title
#  sectiontitle     - Adds the title of the matching section
#  hasrelated       - Indicates whether a related search is available
my $propstr= join "|", @props;

# A trailing argument consisting only of plus signs switches from title search
# to full-text search.  It is removed from @ARGV here.  One plus sign means the
# first page of results; every additional plus sign skips $nresults results.
my $offset= 0;
$offset= $nresults * length(pop @ARGV) if @ARGV && $ARGV[-1] =~ /^\++$/;
my $where= $offset ? "text" : "title";
$offset -= $nresults if $offset;

# Print the help text when no keywords remain or help was requested.
if( !@ARGV || $ARGV[0] =~ /^-+h(?:elp)?$/i ) {
  print <<EOF;
Usage: $0 <keyword> ... [ +.. ]

Searches English-language pages in the Arch Linux Wiki for keywords.  If the
last argument consists entirely of plus signs, full page texts are searched
rather than just the title.  The number of plus signs minus one indicates a
page offset in the search results.  Every page contains the English-language
subset of $nresults results from the Wiki API.

EOF
  exit;
}

# Get request to Arch Wiki API.
# -> Hash reference to GET arguments.  Keys and values are inserted into the
#    query string verbatim, so the caller must URL-escape them as needed.
# <- Retrieved document
# Dies with the request URL if the response is not a success.
sub api_submit
{
  my ($args)= shift;
  my $ua= LWP::UserAgent->new( "env_proxy" => 1 );
  my $url= "https://wiki.archlinux.org/api.php?" .
              join("&", map $_."=".$$args{$_}, keys %$args);
  my $response= $ua->get($url);
  die "Error retrieving API search result. \n" . "URL was:\n$url\n"
    unless $response->is_success();
  return $response->decoded_content();
}

# Keywords are escaped individually and joined with "+" (a space in a query
# string).
my $searchstr= join("+", map uri_escape($_), @ARGV);

my %args= ( action => "query", list => "search", format => "xml",
            srsearch => $searchstr, srprop => $propstr,
            srlimit => $nresults, sroffset => $offset, srwhat => $where,
            srredirects => 0 );

my $total= 0;     # number of <p> result elements seen, in any language
my @results;      # attribute hashes of the results that were kept

# XML::Parser start-tag handler.  Counts every <p> element that carries a
# title and keeps the presumably English-language ones by discarding any
# result whose title ends in a closing parenthesis.
# -> Parser object (ignored), tag name, attribute name/value pairs
sub filter_results
{
  my (undef, $tag, %attrs)= @_;

  return if $tag ne "p";
  # Test for presence rather than truth so that a page titled "0" is kept.
  return if !defined $attrs{title} || $attrs{title} eq "";
  ++$total;
  return if $attrs{title} =~ /\)\s*$/;
  push @results, \%attrs;
}

XML::Parser->new(Handlers => { Start => \&filter_results } )->parse(api_submit(\%args));

print "Found ", 0+@results, " English results out of a total of $total.\n\n";

for my $res (@results) {
  my $url= $res->{title};
  $url =~ s/\s/_/g;
  $url= "https://wiki.archlinux.org/index.php/$url";
  print "$url\n\n";
  # Sort the property names so the output order is the same on every run.
  for my $prop (sort keys %$res) {
    next if $prop =~ /^ns$/i;  # always present, always 0
    my $value= $$res{$prop};
    $value =~ s/\s+$//;
    $value =~ s/\n+/\n    /g;       # indent continuation lines
    # Search matches are wrapped in span tags; show them in reverse video.
    # Any opening span tag is matched, whatever its attributes or quoting.
    $value =~ s/<span\b[^>]*>/\x1B[7m/g;
    $value =~ s/<\/span>/\x1B[0m/g;
    print "  ", ucfirst($prop), ": $value\n\n";
  }
}

# Bugs:
# - Snippets are raw Wiki markup except for span tags indicating matches.
# - Redirect pages cannot be excluded?  Tried srredirects = 0, false, off.
# - Non-English pages are recognised by the language in parentheses in the
#   title, but some non-English pages do not have it.