A few operations for parsing JSON arrays in PySpark, written down as a note for future reference.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *
spark = SparkSession.builder.appName("pyspark").getOrCreate()
Build some test data
import json
import pandas as pd
json_str = '{"class":"三班","nums":40,"data":[{"name":"小明","age":10},{"name":"小红","age":9}],"data1":{"jack":10,"leo":9}}'
json.loads(json_str)
{'class': '三班',
'nums': 40,
'data': [{'name': '小明', 'age': 10}, {'name': '小红', 'age': 9}],
'data1': {'jack': 10, 'leo': 9}}
spark_df = spark.createDataFrame(pd.DataFrame({'x': [json_str]}))
spark_df.printSchema()
spark_df.show(truncate=False)
spark_df.createOrReplaceTempView('test')
root
|-- x: string (nullable = true)
+----------------------------------------------------------------------------------------------------------------+
|x |
+----------------------------------------------------------------------------------------------------------------+
|{"class":"三班","nums":40,"data":[{"name":"小明","age":10},{"name":"小红","age":9}],"data1":{"jack":10,"leo":9}}|
+----------------------------------------------------------------------------------------------------------------+
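As a side note, Spark can infer a usable schema for this JSON string, which comes in handy later when pairing with from_json. A minimal sketch, assuming schema_of_json is available (Spark 2.4+); the alias json_schema is just for illustration:
# Sketch: let Spark infer a DDL schema for the JSON string
spark.range(1).select(F.schema_of_json(F.lit(json_str)).alias('json_schema')).show(truncate=False)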
Using get_json_object and json_tuple
Extracting simple key-value fields. Without further ado, the code:
sql = """
SELECT
get_json_object(x,'$.class') as class,
get_json_object(x,'$.nums') as nums,
get_json_object(x,'$.data1.jack') as jack_age,
json_tuple(x,'class','nums','data1','data1.leo') as (class1,nums1,data1,jack_age1)
FROM test
"""
ddd = spark.sql(sql)
ddd.printSchema()
ddd.show(truncate=True)
root
|-- class: string (nullable = true)
|-- nums: string (nullable = true)
|-- jack_age: string (nullable = true)
|-- class1: string (nullable = true)
|-- nums1: string (nullable = true)
|-- data1: string (nullable = true)
|-- jack_age1: string (nullable = true)
+-----+----+--------+------+-----+-------------------+---------+
|class|nums|jack_age|class1|nums1| data1|jack_age1|
+-----+----+--------+------+-----+-------------------+---------+
| 三班| 40| 10| 三班| 40|{"jack":10,"leo":9}| null|
+-----+----+--------+------+-----+-------------------+---------+
- json_tuple can only parse top-level keys, but it extracts several fields in one call, which saves a bit of typing
- get_json_object supports nested keys and multi-level paths (see the DataFrame-API sketch below)
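The same extraction can also be written with the DataFrame API instead of spark.sql. A minimal sketch, reusing the spark_df built above; the variable name df_api is just for illustration:
# Sketch: DataFrame-API equivalents of get_json_object / json_tuple
df_api = spark_df.select(
    F.get_json_object('x', '$.class').alias('class'),
    F.get_json_object('x', '$.nums').alias('nums'),
    F.get_json_object('x', '$.data1.jack').alias('jack_age'),
    F.json_tuple('x', 'class', 'nums', 'data1').alias('class1', 'nums1', 'data1'),
)
df_api.show(truncate=False)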
Parsing a JSON array
pd.DataFrame(json.loads(json_str)['data'])
   name  age
0  小明   10
1  小红    9
Exploding a JSON array from one row into multiple rows
The idea below: strip the square brackets, replace the "},{" separator between array elements with a placeholder (_ff_), split on that placeholder into an array, then explode it.
json.loads(json_str)
{'class': '三班',
'nums': 40,
'data': [{'name': '小明', 'age': 10}, {'name': '小红', 'age': 9}],
'data1': {'jack': 10, 'leo': 9}}
sql = """
SELECT
j_column,
get_json_object(j_column,'$.name') as name,
get_json_object(j_column,'$.age') as age
FROM
(
SELECT
split(regexp_replace(regexp_replace(get_json_object(x,'$.data'), '\\\\[|\\\\]',''),'\\\\},\\\\{','}_ff_{'),'_ff_') AS data
FROM test
) as a
lateral view explode(data) b as j_column
"""
ddd = spark.sql(sql)
ddd.printSchema()
ddd.show(truncate=False)
root
|-- j_column: string (nullable = true)
|-- name: string (nullable = true)
|-- age: string (nullable = true)
+------------------------+----+---+
|j_column |name|age|
+------------------------+----+---+
|{"name":"小明","age":10}|小明|10 |
|{"name":"小红","age":9} |小红|9 |
+------------------------+----+---+
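On Spark 2.4+ the regexp_replace/split trick can usually be skipped: from_json parses the array against an explicit schema and explode does the rest. A minimal sketch under that assumption (the DDL schema string and the item alias are illustrative):
sql = """
SELECT
    item.name as name,
    item.age as age
FROM test
lateral view explode(from_json(get_json_object(x,'$.data'), 'array<struct<name:string,age:int>>')) b as item
"""
spark.sql(sql).show(truncate=False)
One nice side effect: age comes back as an int rather than a string, because the schema declares it.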
Turning a JSON object (key-value pairs) into rows
{'jack': 10, 'leo': 9}
Split it into rows, adding column names
sql = """
SELECT
j_column,
split(j_column,':')[0] as name,
split(j_column,':')[1] as age
FROM
(
SELECT
split(regexp_replace(get_json_object(x,'$.data1'), '\\\\{|\\\\}',''),',') AS data
FROM test
) as a
lateral view explode(data) b as j_column
"""
ddd = spark.sql(sql)
ddd.printSchema()
ddd.show(truncate=False)
root
|-- j_column: string (nullable = true)
|-- name: string (nullable = true)
|-- age: string (nullable = true)
+---------+------+---+
|j_column |name |age|
+---------+------+---+
|"jack":10|"jack"|10 |
|"leo":9 |"leo" |9 |
+---------+------+---+
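The same from_json approach (again assuming Spark 2.4+) also handles the data1 object as a map, and it avoids the leftover double quotes around jack and leo in the name column above:
sql = """
SELECT name, age
FROM test
lateral view explode(from_json(get_json_object(x,'$.data1'), 'map<string,int>')) b as name, age
"""
spark.sql(sql).show(truncate=False)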
These are the JSON-parsing scenarios I use most often; I'll update this note as new ones come up.
2022-11-26, overcast, Jiangning District, Nanjing