每次执行会叠加更新数据,保存为 perldb 文件,使用 Storable 模块做转储
- =info
- 爬取订单记录、积累数据
- Author: 523066680/vicyang
- Date: 2019-03
- 方案:获取所有旧数据,获取所有在线数据,将数据合并。
- =cut
- use Modern::Perl;
- use Encode;
- use File::Slurp;
- use Storable;
- use FindBin;
- use lib "$FindBin::Bin/../Function";
- use GetOrders;
- STDOUT->autoflush(1);
- my $item = "2050788225";
- my $dbfile = "Brief_${item}.perldb";
- my $old = get_original( $dbfile );
- my $online = GetOrders::get_online_orders( $item );
- printf "orders in db: %d\n", 1+$#$old;
- printf "orders online: %d\n", 1+$#$online;
- say "Merging";
- GetOrders::merge( $old, $online );
- printf "Dumping, items: %d\n", $#$old+1;
- store $old, $dbfile;
- say "Done";
- exit;
- sub get_original
- {
- my $dbfile = shift;
- my $data = -e $dbfile ? retrieve( $dbfile ) : [];
- return $data;
- }
模块代码(路径 ..\Function\GetOrders.pm):
- package GetOrders;
- use Modern::Perl;
- use Mojo::UserAgent;
- use Mojo::DOM;
- use Date::Parse;
- use Date::Format;
- our $main = "https://feedback.aliexpress.com";
- our $url = "https://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm";
- our $ua = Mojo::UserAgent->new();
- our @headers = (
- "Host" => "www.aliexpress.com",
- "User-Agent" => "Firefox/63.0",
- );
- sub get_online_orders
- {
- my ($item) = @_;
- my $orders = [];
- my %args = (
- "productId" => $item,
- "type" => "default",
- "page" => "1"
- );
- my $pgcode = 1;
- my $total = 1;
- my $parts;
- my @all;
- # 按页码顺序获取,数据存储到 @all
- while ( $pgcode <= $total and $pgcode <= 10 )
- {
- printf "Current Page %d\n", $pgcode;
- $args{'page'} = $pgcode;
- my $res = $ua->get( $url, form => \%args )->result;
- redo unless $res->is_success;
- ($parts, $total) = read_json( $res, $pgcode );
- # $parts 返回升序序列,但不同页码的时间段是降序,unshift 将其反转。
- unshift @all, $parts;
- $pgcode++;
- }
- # 按日期(升序)合并
- grep { merge( $orders, $_ ) } @all;
- return $orders;
- }
- # merge 是可复用函数,可以在外部使用。
- sub merge
- {
- my ( $orders, $parts ) = @_;
- if ( $#$orders < 0 ) {
- @$orders = @$parts;
- } else {
- my $sect = 0;
- for my $id ( 0 .. $#$parts-1 ) {
- if ( $parts->[$id][0] == $orders->[-2][0] and
- $parts->[$id+1][0] == $orders->[-1][0] )
- {
- $sect = $id+2;
- last;
- }
- }
- #printf "Sect: %d\n", $sect;
- push @$orders, @{$parts}[$sect .. $#$parts];
- }
- }
- sub read_json
- {
- my ($res, $page) = @_;
- my $node = $res->json;
- my $data = [];
- # 如果页码不对,返回空数据
- return [] if $node->{page}->{current} != $page;
- for my $e ( @{$node->{records}} )
- {
- # 降序反转为升序
- unshift @$data, [
- str2time($e->{date}),
- $e->{countryCode},
- $e->{quantity},
- $e->{name},
- $e->{buyerAccountPointLeval},
- ];
- }
- return ( $data, $node->{page}->{total} );
- }
- 1;