Diggin_Scraperでドコモの機種情報をスクレイピング
開発者向け情報 | サービス・機能 | NTTドコモから機種情報をスクレイピング。
<?php
require_once 'Diggin/Scraper.php';

/**
 * Shared worker for the "WIDTHxHEIGHT with an optional label" cells.
 *
 * Runs $pattern over $value and returns one array('width' => ..., 'height' => ...)
 * entry per match. An entry is keyed by its captured label when that label is
 * non-empty, otherwise by the zero-based index of the match.
 *
 * @param string $pattern     PCRE pattern to apply.
 * @param string $value       Cell text to parse.
 * @param int    $labelGroup  Capture group holding the label.
 * @param int    $widthGroup  Capture group holding the width.
 * @param int    $heightGroup Capture group holding the height.
 * @return array Label-or-index => array('width' => string, 'height' => string).
 */
function collectLabelledSizes($pattern, $value, $labelGroup, $widthGroup, $heightGroup)
{
    $sizes = array();

    // preg_match_all() yields 0 for "no match" and false for a PCRE error;
    // both mean there is nothing to collect.
    if (!preg_match_all($pattern, $value, $match)) {
        return $sizes;
    }

    foreach (array_keys($match[0]) as $i) {
        $label = $match[$labelGroup][$i];
        $key = !empty($label) ? $label : $i;
        $sizes[$key] = array(
            'width'  => $match[$widthGroup][$i],
            'height' => $match[$heightGroup][$i],
        );
    }

    return $sizes;
}

/**
 * Returns the part of the cell text that precedes the first opening parenthesis.
 *
 * Both ASCII "(" and full-width "（" are treated as the opening parenthesis.
 *
 * @param string $value Cell text.
 * @return string|null The leading text, or null when nothing matched.
 */
function getDevice($value)
{
    // Group 1 = text before the parenthesis, group 3 = text inside it (unused).
    if (!preg_match('/([^(（]+)([(（](.*)[)）])?/iu', $value, $match)) {
        return null;
    }

    return $match[1];
}

/**
 * Parses every "WIDTH×HEIGHT" pair in the cell, keyed by the parenthesised
 * label that directly precedes the pair (or by match index when there is none).
 *
 * @param string $value Cell text.
 * @return array Label-or-index => array('width' => string, 'height' => string).
 */
function getFont($value)
{
    // The parentheses around the label are literal characters, so they are
    // matched with character classes; bare "(" / ")" would only add capture
    // groups and shift the group numbers used below.
    return collectLabelledSizes('/([(（]([^)）]*)[)）])?(\d+)×(\d+)/iu', $value, 2, 3, 4);
}

/**
 * Collects every integer in the cell, keyed by the parenthesised label that
 * directly precedes it (optionally followed by a newline), or by match index
 * when there is no label.
 *
 * @param mixed $value Cell content; it is cast to string before matching.
 * @return array Label-or-index => digit string.
 */
function getCharactor($value)
{
    $value = (string)$value;
    $characters = array();

    if (!preg_match_all('/([(（]([^\n]*)[)）]\n?)?(\d+)/iu', $value, $match)) {
        return $characters;
    }

    foreach (array_keys($match[0]) as $i) {
        $key = !empty($match[2][$i]) ? $match[2][$i] : $i;
        $characters[$key] = $match[3][$i];
    }

    return $characters;
}

/**
 * Parses every "WIDTH<non-digits>HEIGHT" pair in the cell, keyed by the
 * parenthesised label that directly follows the pair (or by match index when
 * there is none).
 *
 * @param string $value Cell text.
 * @return array Label-or-index => array('width' => string, 'height' => string).
 */
function getBrowser($value)
{
    return collectLabelledSizes('/(\d+)[^\d]+(\d+)([(（]([^\d)）]*)[)）])?/iu', $value, 4, 1, 2);
}

/**
 * Parses every "WIDTH×HEIGHT" pair in the cell, keyed by the non-digit text
 * that precedes it (surrounding parentheses are optional and stripped), or by
 * match index when there is no such text.
 *
 * @param string $value Cell text.
 * @return array Label-or-index => array('width' => string, 'height' => string).
 */
function getDisplay($value)
{
    // The optional opening parenthesis must be a character class: written as a
    // bare "(?" it would be read by PCRE as the start of a special group
    // instead of "zero or one literal parenthesis".
    return collectLabelledSizes('/([(（]?([^\d)）]*)[)）]?)?(\d+)×(\d+)/iu', $value, 2, 3, 4);
}

/**
 * Extracts the first "白黒" or "カラー" marker and the digits that follow it.
 *
 * @param string $value Cell text.
 * @return array array('type' => string|null, 'num' => string|null); both are
 *               null when the cell contains no such marker.
 */
function getColor($value)
{
    if (!preg_match('/(白黒|カラー)(\d+)/iu', $value, $match)) {
        return array(
            'type' => null,
            'num'  => null,
        );
    }

    return array(
        'type' => $match[1],
        'num'  => $match[2],
    );
}

try {
    $url = 'http://www.nttdocomo.co.jp/service/imode/make/content/spec/screen_area/index.html';

    // Column layout for rows with six cells: the first cell is the device.
    $profile1 = new Diggin_Scraper_Process();
    $profile1->process('/td[1]/span[@class="txt"]', 'device => "TEXT", getDevice')
             ->process('/td[2]/span[@class="txt"]', 'font => "TEXT", getFont')
             ->process('/td[3]/span[@class="txt"]', 'charactor => "RAW", getCharactor')
             ->process('/td[4]/span[@class="txt"]', 'browser => "TEXT", getBrowser')
             ->process('/td[5]/span[@class="txt"]', 'display => "TEXT", getDisplay')
             ->process('/td[6]/span[@class="txt"]', 'color => "TEXT", getColor');

    // Column layout for rows with seven cells: same fields, shifted right by
    // one because the first cell is skipped.
    $profile2 = new Diggin_Scraper_Process();
    $profile2->process('/td[2]/span[@class="txt"]', 'device => "TEXT", getDevice')
             ->process('/td[3]/span[@class="txt"]', 'font => "TEXT", getFont')
             ->process('/td[4]/span[@class="txt"]', 'charactor => "RAW", getCharactor')
             ->process('/td[5]/span[@class="txt"]', 'browser => "TEXT", getBrowser')
             ->process('/td[6]/span[@class="txt"]', 'display => "TEXT", getDisplay')
             ->process('/td[7]/span[@class="txt"]', 'color => "TEXT", getColor');

    // Select the profile by the number of cells in each row.
    $scraper = new Diggin_Scraper();
    $scraper->process('//table/tr[@class="acenter"][count(td)=6]', array('profile[]' => $profile1))
            ->process('//table/tr[@class="acenter"][count(td)=7]', array('profile[]' => $profile2))
            ->scrape($url);

    print_r($scraper->results);
} catch (Exception $e) {
    // Print the exception and stop.
    die($e);
}
テーブルのtdの数が6か7かで記述が重複してるのがいけてない。
追記:
書き直してみた。
<?php
// Single-profile variant: every cell is addressed relative to the last <td>
// of the row, so one set of rules serves rows of differing cell counts.
// NOTE(review): the callbacks named in the rules (getDevice, getFont, ...) and
// the Diggin classes are not defined in this snippet — they must already be loaded.
try {
    $specUrl = 'http://www.nttdocomo.co.jp/service/imode/make/content/spec/screen_area/index.html';

    $rowRules = new Diggin_Scraper_Process();
    $rowRules
        ->process('/td[last()-5]/span[@class="txt"]', 'device => "TEXT", getDevice')
        ->process('/td[last()-4]/span[@class="txt"]', 'font => "TEXT", getFont')
        ->process('/td[last()-3]/span[@class="txt"]', 'charactor => "RAW", getCharactor')
        ->process('/td[last()-2]/span[@class="txt"]', 'browser => "TEXT", getBrowser')
        ->process('/td[last()-1]/span[@class="txt"]', 'display => "TEXT", getDisplay')
        ->process('/td[last()-0]/span[@class="txt"]', 'color => "TEXT", getColor');

    $client = new Diggin_Scraper();
    $client
        ->process('//table/tr[@class="acenter"]', array('profile[]' => $rowRules))
        ->scrape($specUrl);

    print_r($client->results);
} catch (Exception $failure) {
    // Print the exception and stop.
    die($failure);
}
tdが7個のときは先頭のシリーズの列を除外したいので、「最後から何個目か」を指定するようにした。すっきり。