Python - Google画像検索から画像をダウンロードするには？

Question

pythonを使用して、Google画像検索のすべての画像をダウンロードしたいです。使用しているコードに問題があるようです。私のコードは

import os
import sys
import time
from urllib import FancyURLopener
import urllib2
import simplejson

# Define search term
searchTerm = "parrot"

# Replace spaces ' ' in search term for '%20' in order to comply with request
searchTerm = searchTerm.replace(' ','%20')


# Start FancyURLopener with defined version
class MyOpener(FancyURLopener):
    # Overrides the opener's `version` attribute with a browser-like string.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
myopener = MyOpener()

# Set count to 0
count= 0

for i in range(0,10):
    # Notice that the start changes for each iteration in order to request a new set of images for each loop
    url = ('https://ajax.googleapis.com/ajax/services/search/images?' + 'v=1.0& q='+searchTerm+'&start='+str(i*10)+'&userip=MyIP')
    print url
    request = urllib2.Request(url, None, {'Referer': 'testing'})
    response = urllib2.urlopen(request)

    # Get results using JSON
    results = simplejson.load(response)
    data = results['responseData']
    # NOTE: this is the statement the traceback quoted in the question points
    # at: `data` is None here, so subscripting it raises TypeError.
    # Presumably the service answered with a null 'responseData' -- TODO confirm.
    dataInfo = data['results']

    # Iterate for each result and get unescaped url
    for myUrl in dataInfo:
        count = count + 1
        my_url = myUrl['unescapedUrl']
        # Saves every result as "<running count>.jpg" in the working directory.
        myopener.retrieve(myUrl['unescapedUrl'],str(count)+'.jpg')

いくつかのページをダウンロードした後、次のようなエラーが表示されます。

Traceback (most recent call last):

 File "C:\Python27\img_google3.py", line 37, in <module> dataInfo = data['results'] TypeError: 'NoneType' object has no attribute '__getitem__'

何をすべきか？？？？？？

rishabhr0y · Answer

コードを変更しました。これで、コードは特定のクエリに対して100個の画像をダウンロードでき、画像は完全な高解像度であり、元の画像がダウンロードされています。

私はurllib2とBeautiful Soupを使用して画像をダウンロードしています

from bs4 import BeautifulSoup import requests import re import urllib2 import os import cookielib import json def get_soup(url,header): return BeautifulSoup(urllib2.urlopen(urllib2.Request(url,headers=header)),'html.parser') query = raw_input("query image")# you can change the query for the image here image_type="ActiOn" query= query.split() query='+'.join(query) url="https://www.google.co.in/search?q="+query+"&source=lnms&tbm=isch" print url #add the directory for your image here DIR="Pictures" header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36" } soup = get_soup(url,header) ActualImages=[]# contains the link for Large original images, type of image for a in soup.find_all("div",{"class":"rg_meta"}): link , Type =json.loads(a.text)["ou"] ,json.loads(a.text)["ity"] ActualImages.append((link,Type)) print "there are total" , len(ActualImages),"images" if not os.path.exists(DIR): os.mkdir(DIR) DIR = os.path.join(DIR, query.split()[0]) if not os.path.exists(DIR): os.mkdir(DIR) ###print images for i , (img , Type) in enumerate( ActualImages): try: req = urllib2.Request(img, headers={'User-Agent' : header}) raw_img = urllib2.urlopen(req).read() cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1 print cntr if len(Type)==0: f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+".jpg"), 'wb') else : f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+"."+Type), 'wb') f.write(raw_img) f.close() except Exception as e: print "could not load : "+img print e

これがあなたのお役に立てば幸いです

jobin · Answer

Google Image Search APIは非推奨になっているため、目的を達成するにはGoogleカスタム検索を使用する必要があります。画像を取得するには、次のようにします。

import urllib2
import simplejson
import cStringIO

fetcher = urllib2.build_opener()
searchTerm = 'parrot'
startIndex = 0
# startIndex is an int, so it must be converted before being concatenated
# into the URL; str + int raises TypeError.
searchUrl = "http://ajax.googleapis.com/ajax/services/search/images?v=1.0&q=" + searchTerm + "&start=" + str(startIndex)
f = fetcher.open(searchUrl)
# Decode the JSON response body into Python objects.
deserialized_output = simplejson.load(f)

これにより、JSONとして4つの結果が得られます。APIリクエストのstartIndexをインクリメントすることにより、結果を繰り返し取得する必要があります。

画像を取得するには、 cStringIO のようなライブラリを使用する必要があります。

たとえば、最初の画像にアクセスするには、これを行う必要があります。

# URL of the first image in the decoded search response.
imageUrl = deserialized_output['responseData']['results'][0]['unescapedUrl']
# Download with urllib2, the module imported by the preceding snippet; plain
# `urllib` is not imported there. The bytes are wrapped in an in-memory
# file object so they can be opened like a file.
file = cStringIO.StringIO(urllib2.urlopen(imageUrl).read())
# NOTE(review): `Image` is not imported in the visible snippets -- presumably
# PIL's Image module (`from PIL import Image`); confirm before running.
img = Image.open(file)

Mostafa · Answer

GoogleはAPIを廃止し、Googleのスクレイピングは複雑なので、代わりにBing APIを使用することをお勧めします。

https://datamarket.Azure.com/dataset/5BA839F1-12CE-4CCE-BF57-A49D98D29A44

Googleはそれほど良くなく、Microsoftはそれほど悪くない

Piees · Answer

コードはまだ調べていませんが、これは検索用語から400枚の写真を取得しようとするSeleniumで作成されたソリューション例です

# -*- coding: utf-8 -*- from Selenium import webdriver from Selenium.webdriver.common.keys import Keys import json import os import urllib2 searchterm = 'vannmelon' # will also be the name of the folder url = "https://www.google.co.in/search?q="+searchterm+"&source=lnms&tbm=isch" browser = webdriver.Firefox() browser.get(url) header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"} counter = 0 succounter = 0 if not os.path.exists(searchterm): os.mkdir(searchterm) for _ in range(500): browser.execute_script("window.scrollBy(0,10000)") for x in browser.find_elements_by_xpath("//div[@class='rg_meta']"): counter = counter + 1 print "Total Count:", counter print "Succsessful Count:", succounter print "URL:",json.loads(x.get_attribute('innerHTML'))["ou"] img = json.loads(x.get_attribute('innerHTML'))["ou"] imgtype = json.loads(x.get_attribute('innerHTML'))["ity"] try: req = urllib2.Request(img, headers={'User-Agent': header}) raw_img = urllib2.urlopen(req).read() File = open(os.path.join(searchterm , searchterm + "_" + str(counter) + "." + imgtype), "wb") File.write(raw_img) File.close() succounter = succounter + 1 except: print "can't get img" print succounter, "pictures succesfully downloaded" browser.close()

atif93 · Answer

Pieesさんの回答への補足です。検索結果から任意の数の画像をダウンロードするには、最初の400件の結果が読み込まれた後、「Show more results」ボタンのクリックをシミュレートする必要があります。

from Selenium import webdriver from Selenium.webdriver.common.keys import Keys import os import json import urllib2 import sys import time # adding path to geckodriver to the OS environment variable # assuming that it is stored at the same path as this script os.environ["PATH"] += os.pathsep + os.getcwd() download_path = "dataset/" def main(): searchtext = sys.argv[1] # the search query num_requested = int(sys.argv[2]) # number of images to download number_of_scrolls = num_requested / 400 + 1 # number_of_scrolls * 400 images will be opened in the browser if not os.path.exists(download_path + searchtext.replace(" ", "_")): os.makedirs(download_path + searchtext.replace(" ", "_")) url = "https://www.google.co.in/search?q="+searchtext+"&source=lnms&tbm=isch" driver = webdriver.Firefox() driver.get(url) headers = {} headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" extensions = {"jpg", "jpeg", "png", "gif"} img_count = 0 downloaded_img_count = 0 for _ in xrange(number_of_scrolls): for __ in xrange(10): # multiple scrolls needed to show all 400 images driver.execute_script("window.scrollBy(0, 1000000)") time.sleep(0.2) # to load next 400 images time.sleep(0.5) try: driver.find_element_by_xpath("//input[@value='Show more results']").click() except Exception as e: print "Less images found:", e break # imges = driver.find_elements_by_xpath('//div[@class="rg_meta"]') # not working anymore imges = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]') print "Total images:", len(imges), "
" for img in imges: img_count += 1 img_url = json.loads(img.get_attribute('innerHTML'))["ou"] img_type = json.loads(img.get_attribute('innerHTML'))["ity"] print "Downloading image", img_count, ": ", img_url try: if img_type not in extensions: img_type = "jpg" req = urllib2.Request(img_url, headers=headers) raw_img = urllib2.urlopen(req).read() f = open(download_path+searchtext.replace(" ", "_")+"/"+str(downloaded_img_count)+"."+img_type, "wb") f.write(raw_img) f.close downloaded_img_count += 1 except Exception as e: print "Download failed:", e finally: print if downloaded_img_count >= num_requested: break print "Total downloaded: ", downloaded_img_count, "/", img_count driver.quit() if __name__ == "__main__": main()

完全なコードはこちらです。

Suat Atan PhD · Answer

PythonでSeleniumを使用することもできます。方法は次のとおりです。

# The package name is lower-case `selenium`; `Selenium` fails to import.
from selenium import webdriver
import urllib
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome('C:/Python27/Scripts/chromedriver.exe')
Word="Apple"
url="http://images.google.com/search?q="+Word+"&tbm=isch&sout=1"
try:
    driver.get(url)
    # XPath of the first image cell in the results table.
    imageXpathSelector='//*[@id="ires"]/table/tbody/tr[1]/td[1]/a/img'
    img=driver.find_element_by_xpath(imageXpathSelector)
    src=(img.get_attribute('src'))
    # Save the image next to the script as "<Word>.jpg".
    urllib.urlretrieve(src, Word+".jpg")
finally:
    # Close the browser window even when the lookup or download fails.
    driver.close()

（このコードはPython 2.7で動作します）'pip install selenium' でSeleniumパッケージをインストールする必要がある点に注意してください。また、chromedriver.exe を here からダウンロードする必要があります。

他のWebスクレイピングテクニックとは対照的に、Seleniumのミッションはスクレイピングではなくテストであるため、Seleniumはブラウザを開いてアイテムをダウンロードします。

CumminUp07 · Answer

私はこの質問が古いことを知っていますが、私は最近それに出くわし、以前の答えはもう機能しません。そこで、Googleから画像を収集するためにこのスクリプトを書きました。現在のところ、利用可能な限り多くの画像をダウンロードできます。

こちらもgithubへのリンクです https://github.com/CumminUp07/imengine/blob/master/get_google_images.py

免責事項：著作権上の問題により、収集された画像は研究および教育目的にのみ使用されるべきです

from bs4 import BeautifulSoup as Soup
import urllib2
import json
import urllib
import os


#programtically go through google image ajax json return and save links to list#
#num_images is more of a suggestion                                            #
#it will get the ceiling of the nearest 100 if available                       #
def get_links(query_string, num_images):
    """Collect image URLs for *query_string* from Google's image result chunks.

    One request is made per 100 requested images (start = 0, 100, ...), and
    the ``src`` of every ``<img>`` tag in each returned HTML chunk is
    appended to the result list, which is returned.
    """
    #initialize place for links
    links = []
    #step by 100 because each return gives up to 100 links
    for i in range(0,num_images,100):
        # Adjacent literals are joined inside the parentheses so no stray
        # backslash or whitespace ends up inside the URL.
        url = ('https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q='+query_string+
               '&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start='+str(i)+
               '&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg.i&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s')

        #set user agent to avoid 403 error
        request = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})

        #returns json formatted string of the html
        json_string = urllib2.urlopen(request).read()

        #parse as json
        page = json.loads(json_string)

        #html found here
        html = page[1][1]

        #use BeautifulSoup to parse as html
        new_soup = Soup(html,'lxml')

        #all img tags, only returns results of search
        imgs = new_soup.find_all('img')

        #put the src of every image in the links list
        links.extend(img["src"] for img in imgs)

    return links


#download images                              #
#takes list of links, directory to save to    #
#and prefix for file names                    #
#saves images in directory as a one up number #
#with prefix added                            #
#all images will be .jpg                      #
def get_images(links,directory,pre):
    """Download every URL in *links* to ``./<directory>/<pre><index>.jpg``.

    The target directory is created when it does not exist yet, so the first
    download does not fail on a missing folder.
    """
    target_dir = "./"+directory
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    for i, link in enumerate(links):
        urllib.urlretrieve(link, target_dir+"/"+str(pre)+str(i)+".jpg")


#main function to search images                 #
#takes two lists, base term and secondary terms #
#also takes number of images to download per    #
#combination                                    #
#it runs every combination of search terms      #
#with base term first then secondary            #
def search_images(base,terms,num_images):
    """Run get_links/get_images for every (base term, secondary term) pair.

    Files are saved into "images" with the secondary term's index as prefix.
    """
    for y in range(len(base)):
        for x in range(len(terms)):
            all_links = get_links(base[y]+'+'+terms[x],num_images)
            get_images(all_links,"images",x)


if __name__ == '__main__':
    terms = ["cars","numbers","scenery","people","dogs","cats","animals"]
    base = ["animated"]
    search_images(base,terms,1000)