Участник:ShurShur/getwikidumps.pl

См. также: Участник:ShurShur/Dumps

#!/usr/bin/perl
# getwikidumps 20060621 by Shurik
# 
# This script downloads all Wikimedia dumps from URL $base/$language$project
# 
use Getopt::Long;
#use AppConfig;
use LWP::Simple;
use strict;

# Standard project suffixes (used without a language prefix here; the
# prefix is prepended when a dump name such as "ruwiki" is built).
my @projects=(
  "wiki",         # Wikipedia
  "wikibooks",    # WikiBooks
  "wikinews",     # WikiNews
  "wikiquote",    # WikiQuote
  "wikisource",   # WikiSource
  "wiktionary",   # Wiktionary
);

# Default download set: language prefix => reference to a list of project
# suffixes. Project definitions given on the command line replace it.
my %languages=(
  "ru"=>\@projects, # Russian
  "commons"=>["wiki"],
  "meta"=>["wiki"],
);

# File name suffixes that are never downloaded.
my @skipfiles=(
  "pages-meta-history.xml.bz2", # Dup of better compressed pages-meta-history.xml.7z
);

# Base URL of the dump server. It is kept WITHOUT a trailing slash because
# every request is built as "$base/<lang><project>/..."; a trailing slash
# here would put a double slash into each URL path.
my $base="http://download.wikimedia.org";

# Autoflush STDOUT so that progress lines show up immediately.
$|=1;

# Print the usage text to STDOUT and terminate the program.
# The program name shown is $0 with any leading directory part removed.
# Arguments (if any) are ignored.
sub help {
  (my $script=$0)=~s#.*/##;
  my $usage=<<EOF;
Usage: $script [OPTIONS] <project definitions>....
Project definitions:
  <lang>:<project> - e.g. ru:wikibooks for ru.wikibooks.org or meta:wiki for meta.wikimedia.org
  <lang>           - same as <lang>:wiki
  <lang>:          - same as <lang>:wiki
Options:
  -h               - this help
Sample call:
  $script ru: ru:wikibooks meta commons:wiki
    - download dumps for ruwiki, ruwikibooks, metawiki & commonswiki
EOF
  print $usage;
  exit;
}

# parse @ARGV

# Becomes true once the first project definition from the command line has
# been accepted; from then on the built-in defaults are no longer used.
my $owndumplist=0;

# Getopt::Long "<>" callback: register one project definition.
#
# Parameter:
#   $tmp - "<lang>:<project>", "<lang>:" or "<lang>"; a missing or empty
#          project part means "wiki".
#
# The first accepted definition empties %languages so that only explicitly
# requested dumps are downloaded. A definition without a language prefix is
# ignored with a warning, and repeating a definition does not queue the same
# dump twice.
sub adddump {
  my ($tmp)=@_;
  my ($lang,$project)=split /:/,"$tmp";
  if(!defined $lang || $lang eq "") {
    warn "ignoring project definition without language prefix: '$tmp'\n";
    return;
  }
  # split drops a trailing empty field, so "<lang>" and "<lang>:" both
  # leave $project undefined
  $project="wiki" if !defined $project || $project eq "";
  if(!$owndumplist) {
    $owndumplist=1;
    %languages=();
  }
  my $ref=$languages{$lang};
  if(!$ref) {
    $ref=$languages{$lang}=[];
  }
  push @$ref,$project unless grep { $_ eq $project } @$ref;
}
# Process the command line: -h shows the usage text, every non-option
# argument is handed to adddump, and a parse failure shows the usage text too.
GetOptions(
  "-h"=>\&help,
  "<>"=>\&adddump,
) or help();

# Find the newest dump directory of one project.
#
# Parameters:
#   $language, $project - concatenated to form the dump name (e.g. "ruwiki")
#
# Fetches "$base/<lang><project>/" and collects every link of the form
# href="<8 digits>/" (presumably YYYYMMDD dates - confirm against the
# server's index page). Returns the greatest such value, or undef when the
# page cannot be fetched or contains no such link.
sub getlastdate {
  my ($language,$project)=@_;
  my $pg=get("$base/$language$project/");
  # get() yields undef on failure; treat that like a page without links
  my @dates=defined $pg ? ($pg=~m#href="(\d{8})/"#ig) : ();
  # fixed-width digit strings: string order equals numeric order, so the
  # result does not depend on the order in which the page lists the links
  my $latest=(sort @dates)[-1];
  return $latest;
}

# List the files offered for one dump.
#
# Parameters:
#   $language, $project - concatenated to form the dump name (e.g. "ruwiki")
#   $date               - dump directory name as returned by getlastdate
#
# Fetches "$base/<lang><project>/<date>/" and returns, in page order and
# without duplicates, the part of every linked file name that follows
# "<lang><project>-<date>-" (up to the next double quote). Returns an empty
# list when the page cannot be fetched.
sub getfilelist {
  my ($language,$project,$date)=@_;
  my $pg=get("$base/$language$project/$date/");
  # \Q...\E: the name parts come from the command line / a scraped page and
  # must be matched literally, not as regex syntax
  my @found=defined $pg
    ? ($pg=~m#./\Q$language$project-$date-\E(.+?)"#ig)
    : ();
  my %seen;
  my @files=grep { !$seen{$_}++ } @found;
  return @files;
}

# Download a single dump file with wget and report the outcome on STDOUT.
#
# Parameters:
#   $language, $project - concatenated to form the dump name (e.g. "ruwiki")
#   $date               - dump directory name on the server
#   $file               - part of the file name after "<lang><project>-<date>-"
#
# The file is stored as <lang><project>/<date>/<lang><project>-<date>-<file>
# relative to the current directory. Data is written to "<target>.tmp" first
# and renamed only after wget reported success, so an interrupted transfer is
# resumed (wget -c) on a later run instead of being taken for a complete
# file. Nothing is downloaded when the target already exists, when $file is
# listed in @skipfiles, or when $file contains a "/".
sub getfile {
  my ($language,$project,$date,$file)=@_;
  my $dir="$language$project/$date";
  my $name="$language$project-$date-$file";
  my $url="$base/$dir/$name";
  my $loc="$dir/$name";

  print scalar(localtime).": [$language] $project [$date] $file ";

  # $file is scraped from a remote HTML page: a "/" in it could make the
  # download land outside $dir, so such names are never fetched.
  my $skip=($file=~m{/}) || (-f $loc) || scalar(grep { $_ eq $file } @skipfiles);

  my $ok=0;
  if(!$skip) {
    require File::Path;
    require POSIX;
    if(!eval { File::Path::mkpath($dir); 1 }) {
      warn "cannot create directory $dir: $@";
    } else {
      sleep 1; # prevent quick reconnect
      # Fork with a pipe from the child and exec wget with an argument list:
      # no shell is involved, so metacharacters in the scraped file name
      # cannot be interpreted as shell syntax.
      my $pid=open(my $wget,"-|");
      if(!defined $pid) {
        warn "cannot fork for wget: $!\n";
      } elsif(!$pid) {
        # child: wget logs to STDERR, so send that down the pipe as well
        $ENV{LC_ALL}="C"; # the patterns below expect untranslated messages
        open(STDERR,">&STDOUT");
        exec("wget","-c",$url,"-O","$loc.tmp") or POSIX::_exit(127);
      } else {
        while(my $line=<$wget>) {
          # "<time> (<speed>) - <quoted name> saved [<size>]"; the optional
          # date prefix and the free-form quoting allow for differences
          # between wget releases
          if($line=~/^(?:\d{4}-\d\d-\d\d\s)?\d\d:\d\d:\d\d\s\((.+?)\)\s-\s.+?\ssaved\s\[.+?\]/) {
            print "($1)\n";
            $ok=1;
          }
          if($line=~/The\sfile\sis\salready\sfully\sretrieved;\snothing\sto\sdo/) {
            print "(exists)\n";
            $ok=1;
          }
        }
        close $wget;
      }
    }
  }

  if($ok) {
    rename("$loc.tmp",$loc) or warn "cannot rename $loc.tmp to $loc: $!\n";
  } else {
    print "(skipped)\n";
  }
}

# Download the newest dump of one project and log progress on STDOUT.
#
# Parameters:
#   $language, $project - concatenated to form the dump name (e.g. "ruwiki")
#
# Asks getlastdate for the dump date, then fetches every file reported by
# getfilelist through getfile. Logs "[not found]" when no date is available.
sub getproject {
  my ($language,$project)=@_;
  my @dates=getlastdate($language,$project);
  if(!@dates || !$dates[0]) {
    print scalar(localtime).": [$language] $project [not found]\n";
    return;
  }
  for my $date (@dates) {
    # print, not printf: the message contains caller-supplied text and must
    # not be interpreted as a format string
    print scalar(localtime).": [$language] $project [$date] started\n";
    for my $file (getfilelist($language,$project,$date)) {
      getfile($language,$project,$date,$file);
    }
    print scalar(localtime).": [$language] $project [$date] completed\n";
  }
}

# Main loop: fetch every project of every selected language prefix.
foreach my $lang (keys %languages) {
  my $wanted=$languages{$lang};
  foreach my $proj (@$wanted) {
    getproject($lang,$proj);
  }
}

 

Prefix: a b c d e f g h i j k l m n o p q r s t u v w x y z 0 1 2 3 4 5 6 7 8 9

Portal di Ensiklopedia Dunia