nodejs ruby python php goでヤフートップをスクレイピング





nodejs


bash
$ npm i -S cheerio node-fetch
$ node index.js
index.js
const fetch = require('node-fetch');
const cheerio = require('cheerio');

// Browser-like User-Agent so the server returns the normal desktop page.
const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36';

/**
 * Fetch the Yahoo! JAPAN top page and print each top-news headline
 * followed by its link URL.
 */
const main = async () => {
  let res;
  try {
    // Request https://www.yahoo.co.jp/
    res = await fetch('https://www.yahoo.co.jp/', {
      method: 'get',
      headers: {
        'User-Agent': USER_AGENT
      },
      referrer: ''
    });
  } catch (err) {
    // BUG FIX: the original .catch() logged and then fell through,
    // crashing on `_ret.status` (undefined). Stop here instead.
    console.log(err);
    return;
  }

  if (res.status !== 200) {
    console.log(`error status:${res.status}`);
    return;
  }

  // Parse the HTML so it can be queried with a jQuery-like API.
  const $ = cheerio.load(await res.text());

  const items = $('main article section ul')
    .eq(0)
    .find('li');

  // Print the top-news entries. Use .each for side effects (the original
  // used .map, which is meant for building a new collection).
  items.each(function(i) {
    const li = items.eq(i);
    console.log(li.text());
    // .attr('href') returns the attribute value directly (or undefined);
    // the original .attr()['href'] throws when the <a> has no attributes.
    console.log(li.find('a').attr('href'));
    console.log();
  });
};

main();



ruby


bash
$ gem install nokogiri
$ ruby index.rb
index.rb
require "nokogiri"
require "open-uri"

# Fetch the Yahoo! JAPAN top page with a browser-like User-Agent and parse it.
# BUG FIX: Kernel#open on a URL was deprecated in Ruby 2.7 and removed in
# Ruby 3.0 — URI.open is the supported form (available since Ruby 2.5).
doc = Nokogiri::HTML(URI.open("https://www.yahoo.co.jp", "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"))

# The first <ul> under "main article section" holds the top-news items.
items = doc.css("main article section ul")[0].css("li")

# Print each entry: headline text, link href, then a blank separator line.
items.each do |li|
  puts li.content
  puts li.css("a")[0][:href]
  puts
end



go


bash
$ go get github.com/PuerkitoBio/goquery
$ go run index.go
index.go
package main

import (
  "fmt"
  "log"
  "net/http"

  "github.com/PuerkitoBio/goquery"
)

// Fetch the Yahoo! JAPAN top page and print each top-news headline
// followed by its link URL.
func main() {
  // Use https://www.yahoo.co.jp for consistency with the other snippets
  // (the original used plain http://yahoo.co.jp).
  req, err := http.NewRequest("GET", "https://www.yahoo.co.jp", nil)
  if err != nil {
    // BUG FIX: the original discarded this error with `_`.
    log.Fatal(err)
  }
  req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")

  res, err := new(http.Client).Do(req)
  if err != nil {
    // BUG FIX: a discarded transport error here left `res` nil and the
    // next line would panic.
    log.Fatal(err)
  }
  // BUG FIX: the response body was never closed (connection leak).
  defer res.Body.Close()

  if res.StatusCode != 200 {
    log.Fatalf("status code error: %d %s\n", res.StatusCode, res.Status)
  }

  doc, err := goquery.NewDocumentFromReader(res.Body)
  if err != nil {
    // BUG FIX: the original only logged and then used the unusable doc.
    log.Fatal(err)
  }

  li := doc.Find("main article section ul").Eq(0).Find("li")
  li.Each(func(index int, s *goquery.Selection) {
    fmt.Println(s.Text())

    // Attr's second return is a bool ("found"), not an error; the
    // original named it `err`, compared it with `!= true`, and passed
    // the bool to log.Fatal. Skip entries without an href instead of
    // killing the whole program.
    href, ok := s.Find("a").Attr("href")
    if !ok {
      log.Println("li without <a href>, skipping")
      return
    }
    fmt.Println(href + "\n")
  })
}



php


phpQueryでググってphpQuery-onefile.phpをダウンロードする。

bash
$ php index.php
index.php
<?php

require_once './phpQuery-onefile.php';

/**
 * Fetch $url via cURL with a browser-like User-Agent.
 *
 * @param string $url URL to request.
 * @return string|false Response body, or false on failure.
 */
function my_curl($url)
{
  $cp = curl_init();

  /* オプション:レスポンスを文字列として返す(直接出力しない) */
  curl_setopt($cp, CURLOPT_RETURNTRANSFER, 1);

  /* オプション:リダイレクトされたらリダイレクト先のページを取得する
     BUG FIX: 元のコードはこのコメントだけで FOLLOWLOCATION を設定していなかった */
  curl_setopt($cp, CURLOPT_FOLLOWLOCATION, 1);

  /* オプション:URLを指定する */
  curl_setopt($cp, CURLOPT_URL, $url);

  /* オプション:タイムアウト時間を指定する */
  curl_setopt($cp, CURLOPT_TIMEOUT, 30);

  /* オプション:ユーザーエージェントを指定する */
  curl_setopt($cp, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36');

  $data = curl_exec($cp);

  curl_close($cp);

  return $data;
}

$url = 'https://www.yahoo.co.jp';
$doc = phpQuery::newDocument(my_curl($url));

$ul = $doc->find('main article section')->find("ul:eq(0)");

// Hoist the li count out of the loop condition — the original re-ran the
// find() query on every iteration.
$liCount = count($ul->find("li"));
for ($i = 0; $i < $liCount; ++$i) {
  $li = $ul->find("li:eq($i)");
  echo $li[0]->text();
  echo "\n";
  echo $li[0]->find("a")->attr('href').PHP_EOL;
  echo "\n";
}
?>



python


bash
$ pip install beautifulsoup4  # urllib.request は標準ライブラリなので requests は不要
$ python index.py
index.py
import urllib.request as request
from bs4 import BeautifulSoup

# Browser-like User-Agent so the server returns the normal desktop page.
req = request.Request(
    "https://www.yahoo.co.jp",
    None,
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
)

# BUG FIX: close the HTTP response deterministically; the original leaked
# the connection returned by urlopen().
with request.urlopen(req) as instance:
    soup = BeautifulSoup(instance, "html.parser")

# The first <ul> under "main article section" holds the top-news items.
li = soup.select('main article section ul')[0].select('li')

# Print each entry: headline text, link href, blank separator line.
for m in li:
    print(m.text)
    print(m.select("a")[0].get("href"))
    print()



ホームへ