Posted 2020-01-22 | Updated 2024-01-11 | Python / Crawler | a minute read (About 172 words)

Crawler (爬虫)

1. Quick Start

Crawl image URLs from the National Geographic website and download the images.

import os
import re
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup

## Start the request
# Fetch the page; the `with` block closes the HTTP connection once the body
# has been read.
with urlopen("http://www.nationalgeographic.com.cn/animals/") as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')

# Collect every <img> tag whose src matches the image-host pattern.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of triggering Python's invalid-escape-sequence warning.
img_links = soup.find_all("img", {"src": re.compile(r'http://image..*?\.jpg')})
for link in img_links:
    print(link['src'])  # picture location

## Download every matched image into ./img
# Create the target folder up front so open() below does not fail with
# FileNotFoundError when ./img does not exist yet (no manual `mkdir img` needed).
os.makedirs('./img', exist_ok=True)

for link in img_links:
    url = link['src']
    print(url)
    if not url.startswith('http'):
        continue  # only absolute http(s) URLs can be fetched as-is
    image_name = url.split('/')[-1]
    try:
        # The context manager releases the streamed connection; the timeout
        # keeps one stalled server from hanging the whole run.
        with requests.get(url, stream=True, timeout=10) as r:
            if not r.ok:
                # Do not save an HTML error page under a .jpg name.
                print('Skipped %s (HTTP %d)' % (url, r.status_code))
                continue
            with open('./img/%s' % image_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except requests.RequestException as exc:
        # Best effort: report the failure and move on to the next image.
        print('Failed %s: %s' % (url, exc))
        continue
    print('Saved %s' % image_name)