[Perl]批量爬取ZOL壁纸 - 车模系列

There's more than one way to do it!
https://metacpan.org http://perlmonks.org
头像
523066680
Administrator
Administrator
帖子: 369
注册时间: 2016年07月19日 12:14
拥有现金: 锁定
储蓄: 锁定
Has thanked: 32 times
Been thanked: 36 times
联系:

[Perl]批量爬取ZOL壁纸 - 车模系列

帖子 #1 523066680 » 2018年11月03日 08:39

LWP::UserAgent 版

  • 可以关掉脚本重新运行,会判断已完成的部分节省时间。
  • 没开 keep_alive,提取效率偏低
运行环境: Straberry Perl 5.24

Mojo::UserAgent 版,提高效率,请跳转 3楼

  1. =info
  2.     Author: 523066680/vicyang
  3.     Date: 2018-11
  4. =cut
  5.  
  6. use Encode;
  7. use LWP::UserAgent;
  8. use Mojo::DOM;
  9. use File::Slurp;
  10. use File::Basename qw/basename/;
  11. use File::Path qw/mkpath/;
  12. STDOUT->autoflush(1);
  13.  
  14. our $theme = "chemo";
  15. our $wdir = "D:/temp/wallpaper_zol/$theme";
  16. our $main = "http://desk.zol.com.cn";
  17. my $ua = LWP::UserAgent->new();
  18. our @headers = (
  19.         "Host" => "desk.zol.com.cn",
  20.         "User-Agent" => "Firefox/63.0",
  21.     );
  22.  
  23. mkpath $wdir unless -e $wdir;
  24. chdir $wdir;
  25.  
  26. # 获取所有主题链接
  27. my @items;
  28. my $iter = 1;
  29. while ( get_item( $main ."/${theme}/${iter}.html", \@items ) >= 1 )
  30. {
  31.     $iter++;
  32. }
  33.  
  34. # 遍历页面、提取图片
  35. my $idx = 1;
  36. for my $item ( @items )
  37. {
  38.     printf "[%03d/%d] %s %s\n",  $idx++ , $#items+1, $item->{link}, $item->{title};
  39.     get_pages( $item->{link}, $item->{title} );
  40. }
  41.  
  42. sub get_item
  43. {
  44.     my ( $link, $ref ) = @_;
  45.     my $res = try_to_get( $link );
  46.     my $dom = Mojo::DOM->new( $res->content );
  47.  
  48.     for my $e ( $dom->find(".photo-list-padding")->each )
  49.     {
  50.         printf "%s %s\n", $e->at("a")->attr("href"), $e->at("span")->attr("title");
  51.         push @$ref, {'link'  => $main . $e->at("a")->attr("href"),
  52.                      'title' => $e->at("span")->attr("title") };
  53.     }
  54.     # 判断是否为最后一页
  55.     if ( defined $dom->at("#pageNext") ) { return 1 }
  56.     else {  return 0 }
  57. }
  58.  
  59. # --- Get each pages of item --- #
  60.  
  61. sub get_pages
  62. {
  63.     my ($link, $title) = @_;
  64.     my $res = try_to_get( $link );
  65.     my $dom = Mojo::DOM->new( $res->content );
  66.  
  67.     my $path = "${wdir}/${title}";
  68.     mkpath $path unless -e $path;
  69.     chdir $path;
  70.  
  71.     # 图片数量
  72.     my $pics = $dom->at(".photo-list-box li i")->text;
  73.     $pics=~s/[^\d]//;  #去除斜杠
  74.  
  75.     my @files = glob "*.jpg";
  76.     if ( $#files+1 == $pics ) {
  77.         printf "Images already exist\n";
  78.         return;
  79.     }
  80.  
  81.     for my $e ($dom->find(".photo-list-box a")->each )
  82.     {
  83.         #printf "%s\n", $e->attr("href");
  84.         get_pic( $main . $e->attr("href") );
  85.     }
  86. }
  87.  
  88. sub get_pic
  89. {
  90.     my ( $link ) = @_;
  91.     my $res = try_to_get( $link );
  92.     return unless (defined $res);
  93.  
  94.     my $dom = Mojo::DOM->new( $res->content );
  95.     my $pic_url;
  96.     my $pic_name;
  97.  
  98.     my $obj = $dom->at(".wallpaper-down dd a");
  99.     my $sub_url;
  100.  
  101.     while (1)
  102.     {
  103.         $sub_url = $obj->attr("href");
  104.         # 某些图片没有提供指定分辨率的链接
  105.         if ( $sub_url !~/\.html/ ) {
  106.             printf "Did not found picture url, skip %s\n", $sub_url;
  107.             return;
  108.         }
  109.  
  110.         $pic_name = basename($sub_url);
  111.         $pic_name =~ s/\.html/\.jpg/i;
  112.         printf "%s\n", $pic_name;
  113.         return if ( -e $pic_name );
  114.  
  115.         my $res = try_to_get( "${main}${sub_url}" );
  116.         return unless (defined $res);
  117.  
  118.         my $dom = Mojo::DOM->new( $res->content );
  119.         my $dl = $ua->mirror( $dom->at("")->attr("src"), $pic_name );
  120.  
  121.         # 如果下载失败就选择下一个分辨率的图片
  122.         if ( $dl->code != 502 ) { last }
  123.         else { $obj = $obj->next }
  124.     }
  125. }
  126.  
  127. sub try_to_get
  128. {
  129.     our @headers;
  130.     my $link = shift;
  131.     my $ua = LWP::UserAgent->new();
  132.     my $res;
  133.     my $retry = 0;
  134.     do
  135.     {
  136.         $res = $ua->get( $link, @headers );
  137.         if    ( $retry > 0 and $retry < 5 ) { print "Retry times: $retry\n"; }
  138.         elsif ( $retry > 5 )                { print "False\n"; return undef }
  139.         $retry++;
  140.     }
  141.     until ( $res->is_success );
  142.  
  143.     return $res;
  144. }

头像
523066680
Administrator
Administrator
帖子: 369
注册时间: 2016年07月19日 12:14
拥有现金: 锁定
储蓄: 锁定
Has thanked: 32 times
Been thanked: 36 times
联系:

Re: [Perl]批量爬取ZOL壁纸 - 车模系列

帖子 #2 523066680 » 2018年11月03日 10:35

在尝试改为Mojo::UserAgent 的时候。发现官网给出的 save_to 方法是无效的

代码: 全选

# Follow redirects to download Mojolicious from GitHub
$ua->max_redirects(5)
  ->get('https://www.github.com/mojolicious/mojo/tarball/master')
  ->result->save_to('/home/sri/mojo.tar.gz');


暂时用 write_file 函数把 $res->body 写到文件

头像
523066680
Administrator
Administrator
帖子: 369
注册时间: 2016年07月19日 12:14
拥有现金: 锁定
储蓄: 锁定
Has thanked: 32 times
Been thanked: 36 times
联系:

Mojo::UserAgent

帖子 #3 523066680 » 2018年11月03日 10:49

=info
Author: 523066680/vicyang
Date: 2018-11
=cut

use Encode;
use Mojo::UserAgent;
use Mojo::DOM;
use File::Slurp;
use File::Basename qw/basename/;
use File::Path qw/mkpath/;
STDOUT->autoflush(1);

our $theme = "meishi";
our $wdir = "F:/Wallpaper/zol/$theme";
our $main = "http://desk.zol.com.cn";
our $ua = Mojo::UserAgent->new();
our @headers = (
"Host" => "desk.zol.com.cn",
"User-Agent" => "Firefox/63.0",
);

mkpath $wdir unless -e $wdir;
chdir $wdir;

# 获取所有主题链接
my @items;
my $iter = 1;
while ( get_item( $main ."/${theme}/${iter}.html", \@items ) >= 1 )
{
$iter++;
}

# 遍历页面、提取图片
my $idx = 1;
for my $item ( @items )
{
printf "[%03d/%d] %s %s\n", $idx++ , $#items+1, $item->{link}, $item->{title};
get_pages( $item->{link}, $item->{title} );
}

sub get_item
{
my ( $link, $ref ) = @_;
my $res = try_to_get( $link );
my $dom = $res->dom;

for my $e ( $dom->find(".photo-list-padding")->each )
{
printf "%s %s\n", $e->at("a")->attr("href"), $e->at("span")->attr("title");
push @$ref, {'link' => $main . $e->at("a")->attr("href"),
'title' => $e->at("span")->attr("title") };
}
# 判断是否为最后一页
if ( defined $dom->at("#pageNext") ) { return 1 }
else { return 0 }
}

# --- Get each pages of item --- #

sub get_pages
{
my ($link, $title) = @_;
my $res = try_to_get( $link );
my $dom = $res->dom;

my $path = "${wdir}/${title}";
mkpath $path unless -e $path;
chdir $path;

# 图片数量
my $pics = $dom->at(".photo-list-box li i")->text;
$pics=~s/[^\d]//; #去除斜杠

my @files = glob "*.jpg";
if ( $#files+1 == $pics ) {
printf "Images already exist\n";
return;
}

for my $e ($dom->find(".photo-list-box a")->each )
{
#printf "%s\n", $e->attr("href");
get_pic( $main . $e->attr("href") );
}
}

sub get_pic
{
my ( $link ) = @_;
my $res = try_to_get( $link );
return unless (defined $res);

my $dom = $res->dom;
my $pic_url;
my $pic_name;

my $obj = $dom->at(".wallpaper-down dd a");
my $sub_url;

while (1)
{
$sub_url = $obj->attr("href");
# 某些图片没有提供指定分辨率的链接
if ( $sub_url !~/\.html/ ) {
printf "Did not found picture url, skip %s\n", $sub_url;
return;
}

$pic_name = basename($sub_url);
$pic_name =~ s/\.html/\.jpg/i;
printf "%s\n", $pic_name;
return if ( -e $pic_name );

my $res = try_to_get( "${main}${sub_url}" );
return unless (defined $res);

my $dom = $res->dom;
my $res = $ua->get( $dom->at("")->attr("src") )->result;

# 如果下载失败就选择下一个分辨率的图片
if ( $res->code == 502 ) { $obj = $obj->next; next; }

write_file( $pic_name, {binmode=>":raw"}, $res->body );
last;
}
}

sub try_to_get
{
our ($ua, @headers);
my $link = shift;
my $res;
my $retry = 0;
do
{
$res = $ua->get( $link )->result;
if ( $retry > 0 and $retry < 5 ) { print "Retry times: $retry\n"; }
elsif ( $retry > 5 ) { print "False\n"; return undef }
$retry++;
}
until ( $res->is_success );

return $res;
}


回到 “Perl”

在线用户

用户浏览此论坛: 没有注册用户 和 2 访客