リクエストにより非同期でファイルを効率的にダウンロード

Question

私はpythonでファイルをできるだけ速くダウンロードしたいです。これが私のコードです

import pandas as pd import requests from requests_futures.sessions import FuturesSession import os import pathlib from timeit import default_timer as timer class AsyncDownloader: """Download files asynchronously""" __urls = set() __dest_path = None __user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0' __read_timeout = 60 __connection_timeout = 30 __download_count = 0 # unlimited # http://www.browserscope.org/?category=network __worker_count = 17 # No of threads to spawn __chunk_size = 1024 __download_time = -1 __errors = [] # TODO Fetch only content of a specific type from a csv # TODO Improve code structure so that it can be used as a commandline tool def set_source_csv(self, source_path, column_name): self.source_path = source_path self.column_name = column_name try: my_csv = pd.read_csv(source_path, usecols=[self.column_name], chunksize=10) except ValueError: print("The column name doesn't exist") return else: # No exception whatsoever for chunk in my_csv: AsyncDownloader.__urls.update(set(getattr(chunk, self.column_name))) def set_destination_path(self, dest_path): if dest_path.endswith('/'): dest_path = dest_path[:-1] self.dest_path = dest_path # TODO Add exception in case we can't create the directory pathlib.Path(self.dest_path).mkdir(parents=True, exist_ok=True) if os.access(self.dest_path, os.W_OK): AsyncDownloader.__dest_path = pathlib.Path(self.dest_path).resolve() def set_user_agent(self, useragent): self.useragent = useragent AsyncDownloader.__user_agent = self.useragent def set_connection_timeout(self, ctimeout_secs): self.timeout_secs = ctimeout_secs if self.timeout_secs >= 0: AsyncDownloader.__connection_timeout = self.timeout_secs def set_read_timeout(self, rtimeout_secs): self.timeout_secs = rtimeout_secs if self.timeout_secs >= 0: AsyncDownloader.__read_timeout = self.timeout_secs def set_download_count(self, file_count): self.file_count = file_count if self.file_count > 0: AsyncDownloader.__download_count = self.file_count def set_worker_count(self, worker_count): self.worker_count = worker_count if self.worker_count > 0: AsyncDownloader.__worker_count = self.worker_count def set_chunk_size(self, chunk_size): self.chunk_size = chunk_size if self.chunk_size > 0: AsyncDownloader.__chunk_size = self.chunk_size def print_urls(self): print(AsyncDownloader.__urls) def get_download_time(self): return AsyncDownloader.__download_time def get_errors(self): return AsyncDownloader.__errors def download(self): start = timer() try: session = FuturesSession(max_workers=AsyncDownloader.__worker_count) session.headers.update({'user-agent': AsyncDownloader.__user_agent}) session.request(AsyncDownloader.__connection_timeout, AsyncDownloader.__connection_timeout, stream=True) results = [] # Give an accurate file count even if we don't have to download it as it a;ready exist file_count = 0 for url in AsyncDownloader.__urls: filename = os.path.basename(url) # check if we need only a limited number of files if AsyncDownloader.__download_count != 0: # No need to download file if it already exist if pathlib.Path(AsyncDownloader.__dest_path / filename).is_file(): file_count += 1 continue else: if file_count < AsyncDownloader.__download_count: file_count += 1 results.append(session.get(url)) else: if not pathlib.Path(AsyncDownloader.__dest_path / filename).is_file(): results.append(session.get(url)) for result in results: # wait for the response to complete, if it hasn't already response = result.result() filename = os.path.basename(response.url) if response.status_code == 200: with open(pathlib.Path(AsyncDownloader.__dest_path / filename).resolve(), 'wb') as fd: for chunk in response.iter_content(chunk_size=AsyncDownloader.__chunk_size): if chunk: # filter out keep-alive new chunks fd.write(chunk) end = timer() AsyncDownloader.__download_time = end - start except requests.exceptions.HTTPError as errh: AsyncDownloader.__errors.append("Http Error:" + errh) # print("Http Error:", errh) except requests.exceptions.ConnectionError as errc: AsyncDownloader.__errors.append("Error Connecting:" + errc) # print("Error Connecting:", errc) except requests.exceptions.Timeout as errt: AsyncDownloader.__errors.append("Timeout Error:" + errt) # print("Timeout Error:", errt) except requests.exceptions.RequestException as err: AsyncDownloader.__errors.append("OOps: Something Else" + err) else: return

次のコードは非常に悪い仮定をしています。実際、最初のURLが最初に終了すると仮定していますが、これはもちろん正しくありません。

# wait for the response to complete, if it hasn't already response = result.result()

上記のような仮定を効率的に行う代わりに、完了したリクエストのみが確実に処理されるようにするにはどうすればよいですか？

パフォーマンスを改善する方法に関する他の提案をいただければ幸いです。

敬具

nitely · Accepted Answer

接続が正常に完了した場合でも、ファイルを順次処理しています。 2番目のファイルは、最初のファイルが書き込まれるまで待つ必要があります。したがって、あなたができる最善のことは、すべてを並行して処理することです（ディスクへの書き込みやネットワークからの読み取りなどのio操作で解放されるため、これはGILにもかかわらず実行できます）。基本的に、通常のrequestsライブラリ（requests-futuresではない）を使用して、リクエストごとにフューチャー/スレッドとファイル処理を作成します。

書き込み中にチャンクをダウンロードし続けるなど、さらに高速化する方法は他にもあります（つまり、2つのスレッド、1つは要求用、もう1つはファイル処理用）。そして、multi-partリクエストを行うことでチャンクを並列に読み取ります。これは「ダウンロードアクセラレータ」の領域であり、コードにそのような複雑さを望まない場合があります。

編集：また、チャンクダウンロードは遅延します。つまり、最初のリクエストを並行して行うだけですが、実際のチャンクファイルのダウンロードはメインスレッドで行われるため、順次行われます。したがって、現在のアプローチは完全に同期よりもはるかに優れています。上記のアドバイスはまだ有効です。

A. Smoliak · Answer

コードを操作するために、次の順序でいくつかのWebサイトからのいくつかの.csvファイルへのリンクを含むrobots.txtファイルを作成しました：GitHub 、UDemy、YouTube。

デバッグ後、最初の結果は

response = result.result()

だった（この順序で）：UDemy 、YouTube、GitHub 。レコードについては、各robots.txtのサイズは、結果を取得したのと同じ順序で増加しています。つまり、.csvファイルを特定の順序で設定したにもかかわらず、最初に問題がなかったことを意味します。結果は、ファイルが最初にダウンロードされた順序で発生しました。

パフォーマンスを改善する方法に関する他の提案をいただければ幸いです。

パフォーマンスに関しては、ファイルへの応答を書き込むためのスレッドを作成するか、非同期を使用することで速度を向上させることができますIO Tinche/aiofiles などのライブラリ。

さらに進めたい場合は、Pythonの代替実装を使用して、プログラム自体のパフォーマンスを改善してみてください PyPy など。

Nathan Vērzemnieks · Answer

これを行う最も簡単な方法は、スレッド化や特別な非同期コードを必要としません。通常のrequestsライブラリとその組み込みストリーミングオプションを使用するだけです。 response = session.get(url, stream=True)と言い、次にresponse.iter_content(chunk_size=1024)（たとえば）を使用して、ダウンロードされた情報に一度に1つのチャンクにアクセスします。次に機能例を示します。

import requests import os def stream_multiple(urls): responses = {url: requests.get(url, stream=True) for url in urls) streams = {url: responses[url].iter_content(chunk_size=1024) for url in urls} handles = {url: open(os.path.basename(url), 'wb') for url in urls} while streams: for url in list(streams.keys()): try: chunk = next(streams[url]) print("Received {} bytes for {}".format(len(chunk), url)) handles[url].write(chunk) except StopIteration: # no more contenet handles[url].close() streams.pop(url)

出力例：

rat@pandion:~/tmp$ python smu.py Received 1296 bytes for http://www.gutenberg.org/files/9490/9490-0.txt Received 1882 bytes for http://www.gutenberg.org/ebooks/21497.txt.utf-8 Received 1524 bytes for http://www.gutenberg.org/files/1729/1729-0.txt Received 1508 bytes for http://www.gutenberg.org/ebooks/21790.txt.utf-8 Received 1826 bytes for http://www.gutenberg.org/files/9490/9490-0.txt Received 2349 bytes for http://www.gutenberg.org/ebooks/21497.txt.utf-8 Received 1834 bytes for http://www.gutenberg.org/files/1729/1729-0.txt Received 1838 bytes for http://www.gutenberg.org/ebooks/21790.txt.utf-8 Received 2009 bytes for http://www.gutenberg.org/files/9490/9490-0.txt ...

スレッドまたはマルチプロセッシングを使用すると、わずかに高速なパフォーマンスを達成できる可能性がありますが、それで大幅に改善されるとは思えません。事実上すべての場合において、ディスクへのデータの書き込みは、ネットワークからのデータの受信よりもはるかに高速になります。

ADR · Answer

「モンキーパッチ」を気にしないのであればgeventを使うことができます

import gevent.monkey import requests CONNECTIONS = 10 gevent.monkey.patch_all() # debug in PyCharm: https://blog.jetbrains.com/pycharm/2012/08/gevent-debug-support/ import gevent.pool def your_request_without_any_changes(url): return requests.get(url) pool = gevent.pool.Pool(CONNECTIONS) for response in pool.imap_unordered(your_request_without_any_changes, ['http://www.google.com'] * 100): print(response.status_code)

gevent応答を待っているときに別のタスクに切り替えるには、「イベントループ」とパッチリクエストライブラリ（実際にはより低いレベルで発生します）を使用します。