分两步:
1、抓取HTML
- =info
- 523066680/vicyang
- 2018-10
- =cut
- use utf8;
- use Encode;
- use File::Path;
- use File::Slurp;
- use LWP::UserAgent;
- use File::Path;
- use File::Basename qw/basename/;
- use Mojo::DOM;
- STDOUT->autoflush(1);
- our $wdir = encode('gbk', "D:/temp/句子大全");
- mkpath $wdir unless -e $wdir;
- our $main = "http://www.1juzi.com";
- our $ua = LWP::UserAgent->new( keep_alive => 1, timemout => 8 );
- my $res = $ua->get($main);
- my $html = $res->content();
- my $dom = Mojo::DOM->new($html);
- my (@urls, @dirs);
- get_item($dom, \@urls, \@dirs);
- my $tdir;
- for my $id ( 0 .. $#urls )
- {
- printf "%s\n", $dirs[$id];
- next if -e $dirs[$id]; # Skip when this folder exists
- $tdir = $dirs[$id] ."_";
- mkpath $tdir unless -e $tdir;
- get_alist( $main .$urls[$id], $tdir );
- rename( $tdir , $dirs[$id] ); # Restore name
- }
- sub get_item
- {
- our $wdir;
- my ($dom, $urls, $dirs) = @_;
- my $menu = $dom->at(".header-menu");
- for my $e ( $menu->find("ul li a")->each )
- {
- push @$urls, $e->attr("href");
- push @$dirs, sprintf "%s/%s/%s", $wdir, $e->parent->parent->previous->text, $e->text;
- }
- }
- sub get_alist
- {
- our $main;
- my ($url, $dir) = @_;
- my $res = $ua->get( $url );
- my $dom = Mojo::DOM->new( $res->content );
- my @links;
- @links = @{ $dom->at(".alist")->find("a")->map(attr=>"href") };
- #get_page
- my $retry;
- for my $link ( @links )
- {
- printf " %s\n", $link;
- $retry = 0;
- do
- {
- $res = $ua->get( $main .$link );
- $retry++;
- print "retry times: $retry\n" if ($retry > 1 );
- }
- until ( $res->is_success() );
- write_file( $dir ."/". basename($link), $res->content );
- }
- }
2、从本地HTML提取文本分类汇总
- =info
- 523066680/vicyang
- 2018-10
- =cut
- use utf8;
- use Encode;
- use File::Slurp;
- use Mojo::DOM;
- STDOUT->autoflush(1);
- our $wdir = encode('gbk', "D:/Temp/句子大全");
- chdir $wdir or warn "$!";
- my $buff;
- my @files;
- my @dirs = `dir "$wdir" /ad /s /b`;
- grep { s/\r?\n//; } @dirs;
- for my $dir ( @dirs )
- {
- printf "%s\n", $dir;
- chdir $dir or die "$!";
- @files = glob "*.html";
- next unless $#files >= 0;
- $buff = "";
- grep { $buff .= article( $_ ) } sort { substr($b, 0, -5) <=> substr($a, 0, -5) } @files;
- write_file( "${dir}.txt", $buff );
- }
- sub article
- {
- my $file = shift;
- my $html = decode('gbk', scalar(read_file( $file )) );
- $html =~s/ //g;
- $dom = Mojo::DOM->new( $html );
- # remove tags: <script>, <u>, and next/prev page
- grep { $_->remove } $dom->at(".content")->find("script")->each;
- grep { $_->remove } $dom->at(".content")->find("u")->each;
- $dom->at(".page")->remove;
- my $title = $dom->at("h1")->all_text;
- my $text = $dom->at(".content")->all_text;
- $text =~s/(\d+、)/\n$1/g;
- $text =~s/\Q$title\E//;
- $text =~s/[\r\n]+/\n/g;
- $text =~s/^\n//;
- my $str;
- $str = sprintf "%s\n", encode('gbk', $title );
- $str .= sprintf "%s\n", $file;
- $str .= sprintf "%s\n", encode('gbk', $text);
- return $str;
- }