読者です 読者をやめる 読者になる 読者になる

Diggin_Scraperでドコモの機種情報をスクレイピング

「開発者向け情報 | サービス・機能 | NTTドコモ」のページから機種情報をスクレイピング。

<?php
require_once 'Diggin/Scraper.php';

/**
 * Extract the device name from a cell, dropping any parenthesised suffix.
 *
 * The name is the first run of characters that contains no opening
 * parenthesis; both ASCII "(" and full-width "（" are treated as the start
 * of the suffix. Surrounding ASCII whitespace is trimmed.
 *
 * @param string $value Text of the device cell.
 * @return string|null Device name, or null when the cell holds no name
 *                     (e.g. an empty string).
 */
function getDevice($value)
{
    // Guard the no-match case so we never read an undefined $match[1].
    if (!preg_match('/([^(（]+)/u', (string)$value, $match)) {
        return null;
    }

    return trim($match[1]);
}

/**
 * Parse every "<width>×<height>" pair in a cell, keyed by its optional label.
 *
 * A pair may be preceded by a label in parentheses (ASCII or full-width),
 * e.g. "(標準)12×12". Labelled pairs are stored under the label; unlabelled
 * pairs are stored under their zero-based position among the matches.
 *
 * @param string $value Text of the font cell.
 * @return array Map of label|index => array('width' => string, 'height' => string).
 *               Empty when no pair is found.
 */
function getFont($value)
{
    // The parentheses are matched literally and the wrapper is non-capturing,
    // so the capture groups are exactly: 1 = label, 2 = width, 3 = height.
    $pattern = '/(?:[(（]([^)）]*)[)）]\s*)?(\d+)×(\d+)/u';

    $fonts = array();
    if (!preg_match_all($pattern, (string)$value, $matches, PREG_SET_ORDER)) {
        return $fonts;
    }

    foreach ($matches as $i => $m) {
        // Group 1 is an empty string when the pair has no label.
        $key = $m[1] !== '' ? $m[1] : $i;
        $fonts[$key] = array(
            'width'  => $m[2],
            'height' => $m[3],
        );
    }

    return $fonts;
}

/**
 * Parse every number in a cell, keyed by its optional parenthesised label.
 *
 * A number may be preceded by a label in parentheses (ASCII or full-width),
 * optionally followed by whitespace or a line break, e.g. "(標準)\n10".
 * Labelled numbers are stored under the label; unlabelled numbers are stored
 * under their zero-based position among the matches.
 *
 * The function name keeps its original spelling because the scraper rules
 * refer to it by that string.
 *
 * @param mixed $value Cell value; cast to string before matching.
 * @return array Map of label|index => numeric string. Empty when no number
 *               is found.
 */
function getCharactor($value)
{
    $value = (string)$value;

    // Literal parentheses plus a non-capturing wrapper: 1 = label, 2 = number.
    $pattern = '/(?:[(（]([^)）\n]*)[)）]\s*)?(\d+)/u';

    $characters = array();
    if (!preg_match_all($pattern, $value, $matches, PREG_SET_ORDER)) {
        return $characters;
    }

    foreach ($matches as $i => $m) {
        // Group 1 is an empty string when the number has no label.
        $key = $m[1] !== '' ? $m[1] : $i;
        $characters[$key] = $m[2];
    }

    return $characters;
}

/**
 * Parse every "<width><separator><height>" pair in a cell, keyed by the
 * optional parenthesised label that FOLLOWS the pair.
 *
 * The separator is any run of non-digit characters. A label such as "(標準)"
 * (ASCII or full-width parentheses, no digits inside) directly after the
 * height becomes the key; pairs without a label are stored under their
 * zero-based position among the matches.
 *
 * @param string $value Text of the browser-area cell.
 * @return array Map of label|index => array('width' => string, 'height' => string).
 *               Empty when no pair is found.
 */
function getBrowser($value)
{
    // Literal parentheses plus a non-capturing wrapper:
    // 1 = width, 2 = height, 3 = label (absent when there is none).
    $pattern = '/(\d+)[^\d]+(\d+)(?:\s*[(（]([^\d)）]*)[)）])?/u';

    $screens = array();
    if (!preg_match_all($pattern, (string)$value, $matches, PREG_SET_ORDER)) {
        return $screens;
    }

    foreach ($matches as $i => $m) {
        // With PREG_SET_ORDER a trailing unmatched group is omitted entirely,
        // so check for its presence before reading it.
        $key = (isset($m[3]) && $m[3] !== '') ? $m[3] : $i;
        $screens[$key] = array(
            'width'  => $m[1],
            'height' => $m[2],
        );
    }

    return $screens;
}

/**
 * Parse every "<width>×<height>" pair in a cell, keyed by the optional label
 * that precedes the pair.
 *
 * The label is the run of non-digit text directly before the width and may
 * or may not be wrapped in parentheses (ASCII or full-width), e.g.
 * "メイン240×320" or "（メイン）240×320". ASCII whitespace around the label
 * is trimmed. Pairs without a label are stored under their zero-based
 * position among the matches.
 *
 * @param string $value Text of the display-size cell.
 * @return array Map of label|index => array('width' => string, 'height' => string).
 *               Empty when no pair is found.
 */
function getDisplay($value)
{
    // The optional parentheses are matched literally via character classes;
    // an unescaped "(?(" would be parsed as a PCRE conditional and fail to
    // compile. Groups: 1 = label (possibly empty), 2 = width, 3 = height.
    $pattern = '/[(（]?([^\d()（）]*)[)）]?(\d+)×(\d+)/u';

    $screens = array();
    if (!preg_match_all($pattern, (string)$value, $matches, PREG_SET_ORDER)) {
        return $screens;
    }

    foreach ($matches as $i => $m) {
        $label = trim($m[1]);
        $key = $label !== '' ? $label : $i;
        $screens[$key] = array(
            'width'  => $m[2],
            'height' => $m[3],
        );
    }

    return $screens;
}

/**
 * Parse the colour capability of a cell: the kind ("白黒" or "カラー")
 * and the number that immediately follows it.
 *
 * Only the first occurrence in the cell is used.
 *
 * @param string $value Text of the colour cell.
 * @return array array('type' => string|null, 'num' => string|null); both
 *               entries are null when the cell contains neither keyword
 *               followed by digits.
 */
function getColor($value)
{
    // Start from the "nothing found" shape so a non-matching cell never
    // reads undefined offsets.
    $color = array(
        'type' => null,
        'num'  => null,
    );

    if (preg_match('/(白黒|カラー)(\d+)/u', (string)$value, $match)) {
        $color['type'] = $match[1];
        $color['num']  = $match[2];
    }

    return $color;
}

// Scrape the spec table with two separate rule sets: one for rows that have
// six <td> cells and one for rows that have seven. In the seven-cell rows the
// same fields sit one column further right, so the first cell is not read.
try {
    $specPageUrl = 'http://www.nttdocomo.co.jp/service/imode/make/content/spec/screen_area/index.html';

    // Row selectors: "acenter" rows, split by how many cells they contain.
    $sixCellRows   = '//table/tr[@class="acenter"][count(td)=6]';
    $sevenCellRows = '//table/tr[@class="acenter"][count(td)=7]';

    // Six-cell rows: fields are in columns 1..6.
    // NOTE(review): each rule string names the result key, the extraction
    // mode ("TEXT"/"RAW") and the callback applied to the value — confirm
    // the exact semantics against the Diggin_Scraper documentation.
    $sixCellProfile = new Diggin_Scraper_Process();
    $sixCellProfile->process('/td[1]/span[@class="txt"]', 'device => "TEXT", getDevice')
                   ->process('/td[2]/span[@class="txt"]', 'font => "TEXT", getFont')
                   ->process('/td[3]/span[@class="txt"]', 'charactor => "RAW", getCharactor')
                   ->process('/td[4]/span[@class="txt"]', 'browser => "TEXT", getBrowser')
                   ->process('/td[5]/span[@class="txt"]', 'display => "TEXT", getDisplay')
                   ->process('/td[6]/span[@class="txt"]', 'color => "TEXT", getColor');

    // Seven-cell rows: the same fields, in columns 2..7.
    $sevenCellProfile = new Diggin_Scraper_Process();
    $sevenCellProfile->process('/td[2]/span[@class="txt"]', 'device => "TEXT", getDevice')
                     ->process('/td[3]/span[@class="txt"]', 'font => "TEXT", getFont')
                     ->process('/td[4]/span[@class="txt"]', 'charactor => "RAW", getCharactor')
                     ->process('/td[5]/span[@class="txt"]', 'browser => "TEXT", getBrowser')
                     ->process('/td[6]/span[@class="txt"]', 'display => "TEXT", getDisplay')
                     ->process('/td[7]/span[@class="txt"]', 'color => "TEXT", getColor');

    // Both row kinds are registered under the same 'profile[]' key.
    $pageScraper = new Diggin_Scraper();
    $pageScraper->process($sixCellRows, array('profile[]' => $sixCellProfile))
                ->process($sevenCellRows, array('profile[]' => $sevenCellProfile))
                ->scrape($specPageUrl);

    print_r($pageScraper->results);
} catch (Exception $failure) {
    // Print the exception (via its string form) and stop the script.
    exit($failure);
}

テーブルのtdの数が6か7かで記述が重複してるのがいけてない。


追記:
書き直してみた。

<?php
// Scrape the spec table with a single rule set. Columns are addressed from
// the END of each row (last()-5 .. last()-0), so rows with a different number
// of leading cells still map the last six cells to the same fields.
// NOTE(review): this snippet assumes Diggin/Scraper.php and the get*()
// callbacks named in the rules are already loaded — confirm before running
// it on its own.
try {
    $specPageUrl = 'http://www.nttdocomo.co.jp/service/imode/make/content/spec/screen_area/index.html';

    // Every "acenter" row, regardless of how many cells it has.
    $specRows = '//table/tr[@class="acenter"]';

    $rowProfile = new Diggin_Scraper_Process();
    $rowProfile->process('/td[last()-5]/span[@class="txt"]', 'device => "TEXT", getDevice')
               ->process('/td[last()-4]/span[@class="txt"]', 'font => "TEXT", getFont')
               ->process('/td[last()-3]/span[@class="txt"]', 'charactor => "RAW", getCharactor')
               ->process('/td[last()-2]/span[@class="txt"]', 'browser => "TEXT", getBrowser')
               ->process('/td[last()-1]/span[@class="txt"]', 'display => "TEXT", getDisplay')
               ->process('/td[last()-0]/span[@class="txt"]', 'color => "TEXT", getColor');

    $pageScraper = new Diggin_Scraper();
    $pageScraper->process($specRows, array('profile[]' => $rowProfile))
                ->scrape($specPageUrl);

    print_r($pageScraper->results);
} catch (Exception $failure) {
    // Print the exception (via its string form) and stop the script.
    exit($failure);
}

tdが7個の行では先頭のシリーズの列を除外したいので、「最後から何番目か」で列を指定するようにした。すっきり。