非常にネストされたJSONを使用したpandas.io.json.json_normalize

Question

後で分析する予定の、深くネストされたJSONファイルを正規化（normalize）しようとしています。苦労しているのは、1階層より深いレベルまで掘り下げて正規化する方法です。

pandas.io.json.json_normalize のドキュメントを確認しました。この関数は、まさに私がやりたいことを実現してくれるものだからです。

その一部は正規化でき、辞書の仕組みも理解できましたが、まだ目的の結果には到達できていません。

以下のコードでは、最初のレベルのみを取得できます。

import json import pandas as pd from pandas.io.json import json_normalize with open('authors_sample.json') as f: d = json.load(f) raw = json_normalize(d['hits']['hits']) authors = json_normalize(data = d['hits']['hits'], record_path = '_source', meta = ['_id', ['_source', 'journal'], ['_source', 'title'], ['_source', 'normalized_venue_name'] ])

以下のコードで'authors'辞書を「掘り下げ」ようとしていますが、record_path = ['_source', 'authors'] と指定すると TypeError: string indices must be integers が発生します。json_normalize についての私の理解ではこのロジックで正しいはずですが、dict と list が混在する JSON をどのように掘り下げればよいのか、まだよくわかっていません。

この単純な例にも目を通しました。

authors = json_normalize(data = d['hits']['hits'], record_path = ['_source', 'authors'], meta = ['_id', ['_source', 'journal'], ['_source', 'title'], ['_source', 'normalized_venue_name'] ])

以下は、jsonファイルのチャンクです（5レコード）。

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, u'hits': {u'hits': [{u'_id': u'7CB3F2AD', u'_index': u'scibase_listings', u'_score': 1.0, u'_source': {u'authors': None, u'deleted': 0, u'description': None, u'doi': u'', u'is_valid': 1, u'issue': None, u'journal': u'Physical Review Letters', u'link': None, u'meta_description': None, u'meta_keywords': None, u'normalized_venue_name': u'phys rev lett', u'pages': None, u'parent_keywords': [u'Chromatography', u'Quantum mechanics', u'Particle physics', u'Quantum field theory', u'Analytical chemistry', u'Quantum chromodynamics', u'Physics', u'Mass spectrometry', u'Chemistry'], u'pub_date': u'1987-03-02 00:00:00', u'pubtype': None, u'rating_avg_weighted': 0, u'rating_clarity': 0.0, u'rating_clarity_weighted': 0.0, u'rating_innovation': 0.0, u'rating_innovation_weighted': 0.0, u'rating_num_weighted': 0, u'rating_reproducability': 0, u'rating_reproducibility_weighted': 0.0, u'rating_versatility': 0.0, u'rating_versatility_weighted': 0.0, u'review_count': 0, u'tag': [u'mass spectra', u'elementary particles', u'bound states'], u'title': u'Evidence for a new meson: A quasinuclear NN-bar bound state', u'userAvg': 0.0, u'user_id': None, u'venue_name': u'Physical Review Letters', u'views_count': 0, u'volume': None}, u'_type': u'listing'}, {u'_id': u'7AF8EBC3', u'_index': u'scibase_listings', u'_score': 1.0, u'_source': {u'authors': [{u'affiliations': [u'Punjabi University'], u'author_id': u'780E3459', u'author_name': u'munish puri'}, {u'affiliations': [u'Punjabi University'], u'author_id': u'48D92C79', u'author_name': u'rajesh dhaliwal'}, {u'affiliations': [u'Punjabi University'], u'author_id': u'7D9BD37C', u'author_name': u'r s singh'}], u'deleted': 0, u'description': None, u'doi': u'', u'is_valid': 1, u'issue': None, u'journal': u'Journal of Industrial Microbiology & Biotechnology', u'link': None, u'meta_description': None, u'meta_keywords': None, u'normalized_venue_name': u'j ind microbiol biotechnol', u'pages': None, 
u'parent_keywords': [u'Nuclear medicine', u'Psychology', u'Hydrology', u'Chromatography', u'X-ray crystallography', u'Nuclear fusion', u'Medicine', u'Fluid dynamics', u'Thermodynamics', u'Physics', u'Gas chromatography', u'Radiobiology', u'Engineering', u'Organic chemistry', u'High-performance liquid chromatography', u'Chemistry', u'Organic synthesis', u'Psychotherapist'], u'pub_date': u'2008-04-04 00:00:00', u'pubtype': None, u'rating_avg_weighted': 0, u'rating_clarity': 0.0, u'rating_clarity_weighted': 0.0, u'rating_innovation': 0.0, u'rating_innovation_weighted': 0.0, u'rating_num_weighted': 0, u'rating_reproducability': 0, u'rating_reproducibility_weighted': 0.0, u'rating_versatility': 0.0, u'rating_versatility_weighted': 0.0, u'review_count': 0, u'tag': [u'flow rate', u'operant conditioning', u'packed bed reactor', u'immobilized enzyme', u'specific activity'], u'title': u'Development of a stable continuous flow immobilized enzyme reactor for the hydrolysis of inulin', u'userAvg': 0.0, u'user_id': None, u'venue_name': u'Journal of Industrial Microbiology & Biotechnology', u'views_count': 0, u'volume': None}, u'_type': u'listing'}, {u'_id': u'7521A721', u'_index': u'scibase_listings', u'_score': 1.0, u'_source': {u'authors': [{u'author_id': u'7FF872BC', u'author_name': u'barbara eileen ryan'}], u'deleted': 0, u'description': None, u'doi': u'', u'is_valid': 1, u'issue': None, u'journal': u'The American Historical Review', u'link': None, u'meta_description': None, u'meta_keywords': None, u'normalized_venue_name': u'american historical review', u'pages': None, u'parent_keywords': [u'Social science', u'Politics', u'Sociology', u'Law'], u'pub_date': u'1992-01-01 00:00:00', u'pubtype': None, u'rating_avg_weighted': 0, u'rating_clarity': 0.0, u'rating_clarity_weighted': 0.0, u'rating_innovation': 0.0, u'rating_innovation_weighted': 0.0, u'rating_num_weighted': 0, u'rating_reproducability': 0, u'rating_reproducibility_weighted': 0.0, u'rating_versatility': 0.0, 
u'rating_versatility_weighted': 0.0, u'review_count': 0, u'tag': [u'social movements'], u'title': u"Feminism and the women's movement : dynamics of change in social movement ideology, and activism", u'userAvg': 0.0, u'user_id': None, u'venue_name': u'The American Historical Review', u'views_count': 0, u'volume': None}, u'_type': u'listing'}, {u'_id': u'7DAEB9A4', u'_index': u'scibase_listings', u'_score': 1.0, u'_source': {u'authors': [{u'author_id': u'0299B8E9', u'author_name': u'fraser j harbutt'}], u'deleted': 0, u'description': None, u'doi': u'', u'is_valid': 1, u'issue': None, u'journal': u'The American Historical Review', u'link': None, u'meta_description': None, u'meta_keywords': None, u'normalized_venue_name': u'american historical review', u'pages': None, u'parent_keywords': [u'Superconductivity', u'Nuclear fusion', u'Geology', u'Chemistry', u'Metallurgy'], u'pub_date': u'1988-01-01 00:00:00', u'pubtype': None, u'rating_avg_weighted': 0, u'rating_clarity': 0.0, u'rating_clarity_weighted': 0.0, u'rating_innovation': 0.0, u'rating_innovation_weighted': 0.0, u'rating_num_weighted': 0, u'rating_reproducability': 0, u'rating_reproducibility_weighted': 0.0, u'rating_versatility': 0.0, u'rating_versatility_weighted': 0.0, u'review_count': 0, u'tag': [u'iron'], u'title': u'The iron curtain : Churchill, America, and the origins of the Cold War', u'userAvg': 0.0, u'user_id': None, u'venue_name': u'The American Historical Review', u'views_count': 0, u'volume': None}, u'_type': u'listing'}, {u'_id': u'7B3236C5', u'_index': u'scibase_listings', u'_score': 1.0, u'_source': {u'authors': [{u'author_id': u'7DAB7B72', u'author_name': u'richard m freeland'}], u'deleted': 0, u'description': None, u'doi': u'', u'is_valid': 1, u'issue': None, u'journal': u'The American Historical Review', u'link': None, u'meta_description': None, u'meta_keywords': None, u'normalized_venue_name': u'american historical review', u'pages': None, u'parent_keywords': [u'Political Science', 
u'Economics'], u'pub_date': u'1985-01-01 00:00:00', u'pubtype': None, u'rating_avg_weighted': 0, u'rating_clarity': 0.0, u'rating_clarity_weighted': 0.0, u'rating_innovation': 0.0, u'rating_innovation_weighted': 0.0, u'rating_num_weighted': 0, u'rating_reproducability': 0, u'rating_reproducibility_weighted': 0.0, u'rating_versatility': 0.0, u'rating_versatility_weighted': 0.0, u'review_count': 0, u'tag': [u'foreign policy'], u'title': u'The Truman Doctrine and the origins of McCarthyism : foreign policy, domestic politics, and internal security, 1946-1948', u'userAvg': 0.0, u'user_id': None, u'venue_name': u'The American Historical Review', u'views_count': 0, u'volume': None}, u'_type': u'listing'}], u'max_score': 1.0, u'total': 36429433}, u'timed_out': False, u'took': 170}

Martijn Pieters · Accepted Answer

pandasの例（下記）では、角括弧は何を意味しますか？ [] を使ってさらに深く進むために従うべきロジックはありますか。

['state', 'shortname', ['info', 'governor']]の各要素は、選択した行に加えて含める要素へのパスです。'counties'引数はどの要素から行を生成するかを指定し、2番目の引数はそれらの行に含めるメタデータを追加します。

それぞれがパスであり、リストはネストされた構造へのパスを表します。出力例では、state、shortname、info.governor列に対応する値が表示されます。

あなたのJSONの例では、ドキュメントの例の'counties'のように、最初の引数で行として展開できるネストされたリストはわずかしかありません。そのデータ構造の中で該当するのは、ネストされた'authors'キーだけです。そのため、各['_source', 'authors']パスを抽出する必要があり、その後、親オブジェクトの他のキーを追加してそれらの行を拡張できます。

>>> json_normalize(raw, [['_source', 'authors']], ['_id', ['_source', 'journal'], ['_source', 'title']]) affiliations author_id author_name _id \ 0 NaN 166468F4 a bowdoin van riper 7FDFEB02 1 NaN 81070854 jeffrey h schwartz 7FDFEB02 2 [Pennsylvania State University] 7E15BDFA roger l geiger 7538108B _source.journal \ 0 The American Historical Review 1 The American Historical Review 2 The American Historical Review _source.title 0 Men Among the Mammoths: Victorian Science and ... 1 Men Among the Mammoths: Victorian Science and ... 2 Elizabeth Popp Berman. Creating the Market Uni...

したがって、これは著者のデータフレームであり、各著者のメタデータが追加されています（_id値、ジャーナル名、記事タイトル）。

最初の引数のパスに注意してください。ネストされたパスを指定する場合は、（それが1つのパスであっても）パスのリストを渡す必要があります。単に['_source', 'authors']と指定すると、それぞれが単純なトップレベル名である2つの行ソースを探してしまいます。

2番目の引数は、最も外側のオブジェクトから_idキーを取得します。タイトルとジャーナル名もネストされているため、これらはリスト形式のパスで指定しています。

Sander Vanden Hautte · Answer

ライブラリflatten_jsonを確認することもできます。これにより、json_normalizeのように列階層を記述する必要がなくなります。

from flatten_json import flatten data = d['hits']['hits'] dict_flattened = (flatten(record, '.') for record in data) df = pd.DataFrame(dict_flattened) print(df)

https://github.com/amirziai/flatten を参照してください。