- =info
- 523066680/vicyang
- 2018-10
- =cut
- use utf8;
- use Encode;
- use LWP::UserAgent;
- use File::Path;
- use File::Slurp;
- use File::Basename qw/basename/;
- use Mojo::DOM;
- STDOUT->autoflush(1);
- our $wdir = encode('gbk', "D:/temp/力成文学");
- mkpath $wdir unless -e $wdir;
- our $main = "http://www.ceasm.com";
- our $ua = LWP::UserAgent->new( keep_alive => 1, timemout => 8 );
- my $res = $ua->get($main);
- my $html = $res->content();
- my $dom = Mojo::DOM->new($html);
- my ($url, $dir, $buff, $item );
- my (@sUrl, @sItem, @list, $article);
- # 一级栏目
- for my $e ( $dom->at(".menu")->find("[target]")->each )
- {
- $url = $e->attr("href");
- $item = $e->text;
- printf "%s\n", $e->text;
- # 二级栏目
- get_subitem( $url, \@sUrl, \@sItem );
- for my $id ( 0 .. $#sUrl )
- {
- printf " %s\n", $sItem[$id];
- $dir = "${wdir}/${item}/${sItem[$id]}/";
- mkpath $dir unless -e $dir;
- # 所有文章链接
- @list = list( $main . $sUrl[$id] );
- for my $link ( @list )
- {
- printf " %s\n", $link;
- $buff = article($main . $link);
- write_file( $dir . basename($link), $buff );
- }
- }
- }
- # 二级栏目
- sub get_subitem
- {
- my ( $url, $links, $names ) = @_;
- my $res = $ua->get( $url );
- $dom = Mojo::DOM->new( $res->content() );
- @$links = map { $_->attr("href") } ( $dom->at(".keywords")->find("[target]")->each );
- @$names = map { $_->text } ( $dom->at(".keywords")->find("[target]")->each );
- }
- # 文段列表
- sub list
- {
- my $link = shift;
- my $res = $ua->get( $link );
- $dom = Mojo::DOM->new( $res->content() );
- # 获取最大页面值
- $dom->at(".pagelist")->find("a")->last =~ /(\d+)/;
- my $max = $1;
- my @list;
- for my $id ( 1 .. $max )
- {
- $res = $ua->get( $link ."list${id}.html" );
- $dom = Mojo::DOM->new( $res->content() );
- push @list, map { $_->attr("href") } $dom->at(".dedelist")->find("h4 [target]")->each;
- }
- return @list;
- }
- sub article
- {
- my $link = shift;
- my $res;
- do { $res = $ua->get( $link ); } until ( length($res->content) > 2000 );
- return $res->content;
- }
文章提取以及导出
- =info
- 523066680/vicyang
- 2018-10
- =cut
- use utf8;
- use Encode;
- use File::Slurp;
- use LWP::UserAgent;
- use Mojo::DOM;
- STDOUT->autoflush(1);
- our $main = "http://www.ceasm.com";
- our $ua = LWP::UserAgent->new( keep_alive => 1, timemout => 8 );
- our $wdir = encode('gbk', "D:/Temp/力成文学");
- chdir $wdir or warn "$!";
- my $buff;
- my @files;
- my @dirs = `dir "$wdir" /ad /s /b`;
- grep { s/\r?\n//; } @dirs;
- for my $dir ( @dirs )
- {
- printf "%s\n", $dir;
- chdir $dir or die "$!";
- @files = glob "*.html";
- next unless $#files >= 0;
- $buff = "";
- grep { $buff .= article( $_ ) } sort { substr($b, 0, -5) <=> substr($a, 0, -5) } @files;
- write_file( "${dir}.txt", $buff );
- }
- sub article
- {
- my $file = shift;
- my $html = decode('gbk', scalar(read_file( $file )) );
- $html =~s/ /#CRLF/g;
- $html =~s/\n/#CRLF/g; # ------> 1
- $dom = Mojo::DOM->new( $html );
- my $title = $dom->at("h2")->all_text;
- my $text = $dom->at(".text")->all_text;
- $text =~s/\s//g; # ------> 2 去掉所有空白符号包括 space tab \r \n 全角空白符
- $text =~s/(\d+、)/\n$1/g;
- $text =~s/\Q$title\E//;
- $text =~s/#CRLF/\n/g;
- $text =~s/[\r\n]+/\n/g;
- $text =~s/^\n//;
- my $str;
- #标题
- $str = sprintf "%s\n", encode('gbk', $title );
- $str .= sprintf "%s\n", $file;
- $str .= sprintf "%s\n\n", encode('gbk', $text);
- return $str;
- }
- sub xcode
- {
- $_[1]='x' if (not defined $_[1]);
- for my $v ( split(//,$_[0]) )
- {
- print sprintf ("%02$_[1] ",ord($v));
- }
- print "\n\n";
- }