Python(2) Pinterest Pin

目的

AI (機械学習)用の画像を取得しようと思ったら、意外に引っかかる。

PythonでPinterestの検索を行い、その結果の画像のURL等の情報を取得した。

Pinterest APIについては、検索ができるのは自分のアカウントのBoardとPinだけらしい。

今回はPinterest APIを使わずに、すべてのPinを対象に検索したい。

AI (機械学習)用の画像は、参考2のような、公開するデータセットを利用する方法もある。

コード

参考1のコードそのままだが、以下に示す。

import os, sys, time
import requests
import json
import bs4 # beautifulSoupe4
import re  # for "findall"
 
def save_image(file_name, image):
    """Store raw image bytes on disk.

    Args:
        file_name: Path of the file to create (an existing file is overwritten).
        image: Binary content written to the file verbatim.
    """
    with open(file_name, mode='wb') as out_file:
        out_file.write(image)

def _extract_image_info(results):
    """Reduce raw pin records to the four fields this script uses.

    Every record must provide 'description', 'link', 'id' and
    ['images']['orig']['url']; a missing key raises KeyError.
    """
    return [
        {
            'description': r['description'],
            'link':        r['link'],
            'image_url':   r['images']['orig']['url'],
            'id':          r['id'],
        }
        for r in results
    ]


def search(query, num_pins):
    """Search Pinterest for ``query`` and collect information about the pins.

    The first request loads the HTML search page and reads the pins embedded
    in its ``<script type="application/json">`` element.  Further pages are
    then requested from the ``BaseSearchResource`` endpoint until at least
    ``num_pins`` pins have been collected or a page comes back empty.

    Args:
        query: Search keyword(s).
        num_pins: Number of pins to collect.  Whole pages are appended, so
            the returned list may be longer than this.

    Returns:
        A list of dicts with the keys 'description', 'link', 'image_url'
        and 'id', in the order the pins were received.

    Raises:
        KeyError: If a response (JSON body or cookies) lacks an entry that
            is read here.
    """
    # First access
    url     = 'https://www.pinterest.jp/search/pins/'
    headers = {
        'connection': 'keep-alive'
    }

    search_response = requests.get(url, params={'q':query}, headers=headers, stream=False)
    soup            = bs4.BeautifulSoup(search_response.text.replace('\n',''), 'html5lib')

    data_json_string = soup.find('script', type='application/json') # extract json string
    data_json        = json.loads(data_json_string.string) # convert into dictionary type variable
    results          = data_json['tree']['children'][0]['data']['results']

    image_info_list  = _extract_image_info(results)

    # Second or later access to load additional pins that are responded as a JSON string
    url             = 'https://www.pinterest.jp/resource/BaseSearchResource/get/'
    bookmarks       = data_json['resourceDataCache'][0]['resource']['options']['bookmarks']
    experiment_hash = data_json['context']['triggerable_experiments_hash']
    session_cookies = {}

    while len(image_info_list) < num_pins:

        # Merge the cookies of the latest response into the known ones instead
        # of replacing them: a response that does not re-send a cookie must
        # not make the next request fail with KeyError.
        session_cookies.update(dict(search_response.cookies))
        auth_cookies = {
            '_auth':           session_cookies['_auth'],
            'csrftoken':       session_cookies['csrftoken'],
            '_pinterest_sess': session_cookies['_pinterest_sess'],
        }

        ## Preparing parameters, headers and cookies for the "get" request
        params = {
            'source_url':'/search/pins/?q={}'.format(query),
            'data':json.dumps({
                'options':{
                    'bookmarks':bookmarks,
                    'query':query,
                    'scope':'pins',
                    'page_size':25,
                    'field_set_key':'unauth_react'
                },
                'context':{}}),
            # current epoch time in seconds, scaled by 1000
            '_':str(int(time.time()) * 1000)
        }

        headers = {
            'Host':'www.pinterest.jp',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept-Language':'ja,en-US;q=0.7,en;q=0.3',
            'X-Pinterest-AppState': 'background',
            'X-Pinterest-ExperimentHash': experiment_hash,
            'X-NEW-APP':'1',
            'X-APP-VERSION':'9b11f84',
            'X-Requested-With':'XMLHttpRequest',
            'Referer':'https://www.pinterest.jp',
            'cookie':json.dumps(auth_cookies),
            'connection':'keep-alive'
        }

        cookies = dict(auth_cookies)
        cookies.update({
            'bei':'False',
            'logged_out':'True',
            'fba':'True',
            'sessionFunelEventLogged':'1'
        })

        search_response = requests.get(url, cookies=cookies, params=params, headers=headers, stream=False)
        data_json       = json.loads(search_response.text)
        results         = data_json['resource_response']['data']['results']

        # An empty page means nothing more can be collected; without this
        # check the loop would keep requesting forever.
        if not results:
            break

        bookmarks       = data_json['resource']['options']['bookmarks']
        experiment_hash = data_json['client_context']['triggerable_experiments_hash']

        image_info_list.extend(_extract_image_info(results))

    return image_info_list
 
 
def main(argv):
    """Search Pinterest for a fixed keyword and save the found images.

    The keyword, the number of pins and the output directory are set by the
    local variables at the top of the function.  Each image is stored in the
    output directory under the file name taken from its URL.

    Args:
        argv: Command-line arguments; currently not used.

    Raises:
        SystemExit: If an image URL answers with an HTTP error status.
    """
    from urllib.parse import urlsplit  # only needed here

    keyword  = 'xxx' # keyword you want to search
    num_pins = 100 # Number of pins searched
    img_dir  = 'images'
    timeout  = 10 # in second

    # open() does not create missing directories, so make sure it exists.
    os.makedirs(img_dir, exist_ok=True)

    image_info_list = search(keyword, num_pins)

    for img_info in image_info_list:
        img_url = img_info['image_url']

        # Retrieve the file name of the image: the last component of the URL
        # path, whatever its extension is.
        img_name = os.path.basename(urlsplit(img_url).path)
        if not img_name:
            # URL path ends with '/': fall back to the pin id as the name
            img_name = str(img_info['id'])

        # Get the content of the image
        img_response = requests.get(img_url, timeout=timeout)
        try:
            # raise_for_status() returns None on success and raises on 4xx/5xx
            img_response.raise_for_status()
        except requests.HTTPError:
            sys.exit('HTTP Error When Accessing The Image File!') # if not succeeded, this script will be terminated

        # Save the image
        save_image(os.path.join(img_dir, img_name), img_response.content)
 
 
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main(sys.argv)

search関数に検索ワードと取得したい画像数を渡すと画像情報のリストが返り、main関数がその画像をimagesディレクトリに保存する。

参考

  1. http://hassiweb-programming.blogspot.com/2017/07/retrieve-pinterest-pins-by-python.html — PythonでPinterestのPin (画像)の検索結果を取得する
  2. https://ai.google/tools/datasets/