RDDs, Schemas, and Data Types with PySpark
• 11 min read
from pyspark.sql import SparkSession
# Load data from a CSV
# "header" = True takes the column names from the file's first line, and
# "inferSchema" = True asks the reader to derive each column's type from the
# data instead of reading everything as strings.
# NOTE(review): `spark` is never created in this file — presumably the
# session object pre-defined by the notebook environment. TODO confirm.
file_location = "/FileStore/tables/df_panel_fix.csv"
df = spark.read.format("CSV").option("inferSchema", True).option("header", True).load(file_location)
# Render the first five rows. display() is not defined in this file either —
# presumably the notebook's built-in table renderer. TODO confirm.
display(df.take(5))
_c0 | province | specific | general | year | gdp | fdi | rnr | rr | i | fr | reg | it |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Anhui | 147002.0 | null | 1996 | 2093.3 | 50661 | 0.0 | 0.0 | 0.0 | 1128873 | East China | 631930 |
1 | Anhui | 151981.0 | null | 1997 | 2347.32 | 43443 | 0.0 | 0.0 | 0.0 | 1356287 | East China | 657860 |
2 | Anhui | 174930.0 | null | 1998 | 2542.96 | 27673 | 0.0 | 0.0 | 0.0 | 1518236 | East China | 889463 |
3 | Anhui | 285324.0 | null | 1999 | 2712.34 | 26131 | null | null | null | 1646891 | East China | 1227364 |
4 | Anhui | 195580.0 | 32100.0 | 2000 | 2902.09 | 31847 | 0.0 | 0.0 | 0.0 | 1601508 | East China | 1499110 |
df.show()
+---+--------+---------+--------+----+-------+------+----+-----------+-----------+-------+-----------+-------+
_c0|province| specific| general|year| gdp| fdi| rnr| rr| i| fr| reg| it|
+---+--------+---------+--------+----+-------+------+----+-----------+-----------+-------+-----------+-------+
0| Anhui| 147002.0| null|1996| 2093.3| 50661| 0.0| 0.0| 0.0|1128873| East China| 631930|
1| Anhui| 151981.0| null|1997|2347.32| 43443| 0.0| 0.0| 0.0|1356287| East China| 657860|
2| Anhui| 174930.0| null|1998|2542.96| 27673| 0.0| 0.0| 0.0|1518236| East China| 889463|
3| Anhui| 285324.0| null|1999|2712.34| 26131|null| null| null|1646891| East China|1227364|
4| Anhui| 195580.0| 32100.0|2000|2902.09| 31847| 0.0| 0.0| 0.0|1601508| East China|1499110|
5| Anhui| 250898.0| null|2001|3246.71| 33672| 0.0| 0.0| 0.0|1672445| East China|2165189|
6| Anhui| 434149.0| 66529.0|2002|3519.72| 38375| 0.0| 0.0| 0.0|1677840| East China|2404936|
7| Anhui| 619201.0| 52108.0|2003|3923.11| 36720| 0.0| 0.0| 0.0|1896479| East China|2815820|
8| Anhui| 898441.0|349699.0|2004| 4759.3| 54669| 0.0| 0.0| 0.0| null| East China|3422176|
9| Anhui| 898441.0| null|2005|5350.17| 69000| 0.0| 0.0|0.324324324| null| East China|3874846|
10| Anhui|1457872.0|279052.0|2006| 6112.5|139354| 0.0| 0.0|0.324324324|3434548| East China|5167300|
11| Anhui|2213991.0|178705.0|2007|7360.92|299892| 0.0| 0.0|0.324324324|4468640| East China|7040099|
12| Beijing| 165957.0| null|1996| 1789.2|155290|null| null| null| 634562|North China| 508135|
13| Beijing| 165957.0| null|1997|2077.09|159286| 0.0| 0.0| 0.6| 634562|North China| 569283|
14| Beijing| 245198.0| null|1998|2377.18|216800| 0.0| 0.0| 0.53| 938788|North China| 695528|
15| Beijing| 388083.0| null|1999|2678.82|197525| 0.0| 0.0| 0.53| null|North China| 944047|
16| Beijing| 281769.0|188633.0|2000|3161.66|168368| 0.0| 0.0| 0.53|1667114|North China| 757990|
17| Beijing| 441923.0| null|2001|3707.96|176818| 0.0| 0.0| 0.53|2093925|North China|1194728|
18| Beijing| 558569.0|280277.0|2002| 4315.0|172464| 0.0| 0.0| 0.53|2511249|North China|1078754|
19| Beijing| 642581.0|269596.0|2003|5007.21|219126| 0.0|0.794871795| 0.0|2823366|North China|1426600|
+---+--------+---------+--------+----+-------+------+----+-----------+-----------+-------+-----------+-------+
only showing top 20 rows
df.printSchema()
root
-- _c0: integer (nullable = true)
-- province: string (nullable = true)
-- specific: double (nullable = true)
-- general: double (nullable = true)
-- year: integer (nullable = true)
-- gdp: double (nullable = true)
-- fdi: integer (nullable = true)
-- rnr: double (nullable = true)
-- rr: double (nullable = true)
-- i: double (nullable = true)
-- fr: string (nullable = true)
-- reg: string (nullable = true)
-- it: integer (nullable = true)
df.columns
Out[64]: ['_c0',
'province',
'specific',
'general',
'year',
'gdp',
'fdi',
'rnr',
'rr',
'i',
'fr',
'reg',
'it']
df.describe()
Out[65]: DataFrame[summary: string, _c0: string, province: string, specific: string, general: string, year: string, gdp: string, fdi: string, rnr: string, rr: string, i: string, fr: string, reg: string, it: string]
from pyspark.sql.types import StructField,StringType,IntegerType,StructType
# Hand-written schema: one StructField(name, type, nullable) per CSV column,
# passed to the reader below in place of inferSchema.
# NOTE(review): with inferSchema the columns specific, general, gdp, rnr, rr
# and i were reported as double, and fr as string. Declaring them IntegerType
# here is the likely reason specific/general/gdp/rnr/rr/i come back entirely
# null in the outputs recorded after this cell (values such as "147002.0" or
# "2093.3" do not parse as integers). DoubleType would presumably be needed
# to keep those values — TODO confirm against the CSV.
data_schema = [
StructField("_c0", IntegerType(), True)
,StructField("province", StringType(), True)
,StructField("specific", IntegerType(), True)
,StructField("general", IntegerType(), True)
,StructField("year", IntegerType(), True)
,StructField("gdp", IntegerType(), True)
,StructField("fdi", IntegerType(), True)
,StructField("rnr", IntegerType(), True)
,StructField("rr", IntegerType(), True)
,StructField("i", IntegerType(), True)
,StructField("fr", IntegerType(), True)
,StructField("reg", StringType(), True)
,StructField("it", IntegerType(), True)
]
# Wrap the list of fields in the StructType that the reader accepts.
final_struc = StructType(fields=data_schema)
# Re-read the same file, this time with the explicit schema.
# NOTE(review): unlike the first load, no "header" option is set here; the
# recorded outputs show the header line coming back as the first data row
# (province='province', reg='reg', everything else null). Adding
# .option("header", True) would presumably drop it — TODO confirm.
df = spark.read.format("CSV").schema(final_struc).load(file_location)
df.printSchema()
root
-- _c0: integer (nullable = true)
-- province: string (nullable = true)
-- specific: integer (nullable = true)
-- general: integer (nullable = true)
-- year: integer (nullable = true)
-- gdp: integer (nullable = true)
-- fdi: integer (nullable = true)
-- rnr: integer (nullable = true)
-- rr: integer (nullable = true)
-- i: integer (nullable = true)
-- fr: integer (nullable = true)
-- reg: string (nullable = true)
-- it: integer (nullable = true)
df.show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+
_c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+
null|province| null| null|null|null| null|null|null|null| null| reg| null|
0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930|
1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860|
2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463|
3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364|
4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110|
5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189|
6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936|
7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820|
8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176|
9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846|
10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300|
11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099|
12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135|
13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283|
14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528|
15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047|
16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990|
17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728|
18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+
only showing top 20 rows
df['fr']
Out[72]: Column<b'fr'>
type(df['fr'])
Out[73]: pyspark.sql.column.Column
df.select('fr')
Out[74]: DataFrame[fr: int]
type(df.select('fr'))
Out[75]: pyspark.sql.dataframe.DataFrame
df.select('fr').show()
+-------+
fr|
+-------+
null|
1128873|
1356287|
1518236|
1646891|
1601508|
1672445|
1677840|
1896479|
null|
null|
3434548|
4468640|
634562|
634562|
938788|
null|
1667114|
2093925|
2511249|
+-------+
only showing top 20 rows
df.head(2)
Out[77]: [Row(_c0=None, province='province', specific=None, general=None, year=None, gdp=None, fdi=None, rnr=None, rr=None, i=None, fr=None, reg='reg', it=None),
Row(_c0=0, province='Anhui', specific=None, general=None, year=1996, gdp=None, fdi=50661, rnr=None, rr=None, i=None, fr=1128873, reg='East China', it=631930)]
df.select(['reg','fr'])
Out[78]: DataFrame[reg: string, fr: int]
df.select(['reg','fr']).show()
+-----------+-------+
reg| fr|
+-----------+-------+
reg| null|
East China|1128873|
East China|1356287|
East China|1518236|
East China|1646891|
East China|1601508|
East China|1672445|
East China|1677840|
East China|1896479|
East China| null|
East China| null|
East China|3434548|
East China|4468640|
North China| 634562|
North China| 634562|
North China| 938788|
North China| null|
North China|1667114|
North China|2093925|
North China|2511249|
+-----------+-------+
only showing top 20 rows
df.withColumn('fiscal_revenue',df['fr']).show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+--------------+
_c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|fiscal_revenue|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+--------------+
null|province| null| null|null|null| null|null|null|null| null| reg| null| null|
0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 1128873|
1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 1356287|
2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 1518236|
3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 1646891|
4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 1601508|
5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 1672445|
6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 1677840|
7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 1896479|
8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| null|
9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| null|
10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 3434548|
11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 4468640|
12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 634562|
13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 634562|
14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 938788|
15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| null|
16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 1667114|
17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 2093925|
18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| 2511249|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+--------------+
only showing top 20 rows
df.show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+
_c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+
null|province| null| null|null|null| null|null|null|null| null| reg| null|
0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930|
1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860|
2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463|
3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364|
4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110|
5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189|
6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936|
7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820|
8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176|
9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846|
10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300|
11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099|
12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135|
13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283|
14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528|
15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047|
16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990|
17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728|
18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+
only showing top 20 rows
df.withColumnRenamed('fr','new_fiscal_revenue').show()
+----+--------+--------+-------+----+----+------+----+----+----+------------------+-----------+-------+
_c0|province|specific|general|year| gdp| fdi| rnr| rr| i|new_fiscal_revenue| reg| it|
+----+--------+--------+-------+----+----+------+----+----+----+------------------+-----------+-------+
null|province| null| null|null|null| null|null|null|null| null| reg| null|
0| Anhui| null| null|1996|null| 50661|null|null|null| 1128873| East China| 631930|
1| Anhui| null| null|1997|null| 43443|null|null|null| 1356287| East China| 657860|
2| Anhui| null| null|1998|null| 27673|null|null|null| 1518236| East China| 889463|
3| Anhui| null| null|1999|null| 26131|null|null|null| 1646891| East China|1227364|
4| Anhui| null| null|2000|null| 31847|null|null|null| 1601508| East China|1499110|
5| Anhui| null| null|2001|null| 33672|null|null|null| 1672445| East China|2165189|
6| Anhui| null| null|2002|null| 38375|null|null|null| 1677840| East China|2404936|
7| Anhui| null| null|2003|null| 36720|null|null|null| 1896479| East China|2815820|
8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176|
9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846|
10| Anhui| null| null|2006|null|139354|null|null|null| 3434548| East China|5167300|
11| Anhui| null| null|2007|null|299892|null|null|null| 4468640| East China|7040099|
12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135|
13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283|
14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528|
15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047|
16| Beijing| null| null|2000|null|168368|null|null|null| 1667114|North China| 757990|
17| Beijing| null| null|2001|null|176818|null|null|null| 2093925|North China|1194728|
18| Beijing| null| null|2002|null|172464|null|null|null| 2511249|North China|1078754|
+----+--------+--------+-------+----+----+------+----+----+----+------------------+-----------+-------+
only showing top 20 rows
df.withColumn('double_fiscal_revenue',df['fr']*2).show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+---------------------+
_c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|double_fiscal_revenue|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+---------------------+
null|province| null| null|null|null| null|null|null|null| null| reg| null| null|
0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 2257746|
1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 2712574|
2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 3036472|
3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 3293782|
4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 3203016|
5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 3344890|
6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 3355680|
7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 3792958|
8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| null|
9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| null|
10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 6869096|
11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 8937280|
12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 1269124|
13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 1269124|
14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 1877576|
15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| null|
16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 3334228|
17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 4187850|
18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| 5022498|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+---------------------+
only showing top 20 rows
df.withColumn('add_fiscal_revenue',df['fr']+1).show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+------------------+
_c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|add_fiscal_revenue|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+------------------+
null|province| null| null|null|null| null|null|null|null| null| reg| null| null|
0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 1128874|
1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 1356288|
2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 1518237|
3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 1646892|
4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 1601509|
5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 1672446|
6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 1677841|
7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 1896480|
8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| null|
9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| null|
10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 3434549|
11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 4468641|
12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 634563|
13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 634563|
14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 938789|
15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| null|
16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 1667115|
17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 2093926|
18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| 2511250|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+------------------+
only showing top 20 rows
df.withColumn('half_fiscal_revenue',df['fr']/2).show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+-------------------+
_c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|half_fiscal_revenue|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+-------------------+
null|province| null| null|null|null| null|null|null|null| null| reg| null| null|
0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 564436.5|
1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 678143.5|
2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 759118.0|
3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 823445.5|
4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 800754.0|
5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 836222.5|
6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 838920.0|
7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 948239.5|
8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| null|
9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| null|
10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 1717274.0|
11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 2234320.0|
12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 317281.0|
13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 317281.0|
14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 469394.0|
15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| null|
16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 833557.0|
17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 1046962.5|
18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| 1255624.5|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+-------------------+
only showing top 20 rows
df.withColumn('half_fr',df['fr']/2)
Out[86]: DataFrame[_c0: int, province: string, specific: int, general: int, year: int, gdp: int, fdi: int, rnr: int, rr: int, i: int, fr: int, reg: string, it: int, half_fr: double]
df.createOrReplaceTempView("economic_data")
sql_results = spark.sql("SELECT * FROM economic_data")
sql_results
Out[89]: DataFrame[_c0: int, province: string, specific: int, general: int, year: int, gdp: int, fdi: int, rnr: int, rr: int, i: int, fr: int, reg: string, it: int]
sql_results.show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+
_c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+
null|province| null| null|null|null| null|null|null|null| null| reg| null|
0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930|
1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860|
2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463|
3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364|
4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110|
5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189|
6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936|
7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820|
8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176|
9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846|
10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300|
11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099|
12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135|
13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283|
14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528|
15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047|
16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990|
17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728|
18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754|
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+
only showing top 20 rows
spark.sql("SELECT * FROM economic_data WHERE fr=634562").show()
+---+--------+--------+-------+----+----+------+----+----+----+------+-----------+------+
_c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|
+---+--------+--------+-------+----+----+------+----+----+----+------+-----------+------+
12| Beijing| null| null|1996|null|155290|null|null|null|634562|North China|508135|
13| Beijing| null| null|1997|null|159286|null|null|null|634562|North China|569283|
+---+--------+--------+-------+----+----+------+----+----+----+------+-----------+------+
This post includes code adapted from the Spark and Python for Big Data Udemy course and the Spark and Python for Big Data notebooks.