from pyspark.sql import SparkSession

# Load data from a CSV
file_location = "/FileStore/tables/df_panel_fix.csv"
df = spark.read.format("CSV").option("inferSchema", True).option("header", True).load(file_location)
display(df.take(5))
_c0provincespecificgeneralyeargdpfdirnrrrifrregit
0Anhui147002.0null19962093.3506610.00.00.01128873East China631930
1Anhui151981.0null19972347.32434430.00.00.01356287East China657860
2Anhui174930.0null19982542.96276730.00.00.01518236East China889463
3Anhui285324.0null19992712.3426131nullnullnull1646891East China1227364
4Anhui195580.032100.020002902.09318470.00.00.01601508East China1499110
df.show()
+---+--------+---------+--------+----+-------+------+----+-----------+-----------+-------+-----------+-------+ _c0|province| specific| general|year| gdp| fdi| rnr| rr| i| fr| reg| it| +---+--------+---------+--------+----+-------+------+----+-----------+-----------+-------+-----------+-------+ 0| Anhui| 147002.0| null|1996| 2093.3| 50661| 0.0| 0.0| 0.0|1128873| East China| 631930| 1| Anhui| 151981.0| null|1997|2347.32| 43443| 0.0| 0.0| 0.0|1356287| East China| 657860| 2| Anhui| 174930.0| null|1998|2542.96| 27673| 0.0| 0.0| 0.0|1518236| East China| 889463| 3| Anhui| 285324.0| null|1999|2712.34| 26131|null| null| null|1646891| East China|1227364| 4| Anhui| 195580.0| 32100.0|2000|2902.09| 31847| 0.0| 0.0| 0.0|1601508| East China|1499110| 5| Anhui| 250898.0| null|2001|3246.71| 33672| 0.0| 0.0| 0.0|1672445| East China|2165189| 6| Anhui| 434149.0| 66529.0|2002|3519.72| 38375| 0.0| 0.0| 0.0|1677840| East China|2404936| 7| Anhui| 619201.0| 52108.0|2003|3923.11| 36720| 0.0| 0.0| 0.0|1896479| East China|2815820| 8| Anhui| 898441.0|349699.0|2004| 4759.3| 54669| 0.0| 0.0| 0.0| null| East China|3422176| 9| Anhui| 898441.0| null|2005|5350.17| 69000| 0.0| 0.0|0.324324324| null| East China|3874846| 10| Anhui|1457872.0|279052.0|2006| 6112.5|139354| 0.0| 0.0|0.324324324|3434548| East China|5167300| 11| Anhui|2213991.0|178705.0|2007|7360.92|299892| 0.0| 0.0|0.324324324|4468640| East China|7040099| 12| Beijing| 165957.0| null|1996| 1789.2|155290|null| null| null| 634562|North China| 508135| 13| Beijing| 165957.0| null|1997|2077.09|159286| 0.0| 0.0| 0.6| 634562|North China| 569283| 14| Beijing| 245198.0| null|1998|2377.18|216800| 0.0| 0.0| 0.53| 938788|North China| 695528| 15| Beijing| 388083.0| null|1999|2678.82|197525| 0.0| 0.0| 0.53| null|North China| 944047| 16| Beijing| 281769.0|188633.0|2000|3161.66|168368| 0.0| 0.0| 0.53|1667114|North China| 757990| 17| Beijing| 441923.0| null|2001|3707.96|176818| 0.0| 0.0| 0.53|2093925|North China|1194728| 18| Beijing| 558569.0|280277.0|2002| 4315.0|172464| 0.0| 0.0| 0.53|2511249|North China|1078754| 19| Beijing| 642581.0|269596.0|2003|5007.21|219126| 0.0|0.794871795| 0.0|2823366|North China|1426600| +---+--------+---------+--------+----+-------+------+----+-----------+-----------+-------+-----------+-------+ only showing top 20 rows
df.printSchema()
root -- _c0: integer (nullable = true) -- province: string (nullable = true) -- specific: double (nullable = true) -- general: double (nullable = true) -- year: integer (nullable = true) -- gdp: double (nullable = true) -- fdi: integer (nullable = true) -- rnr: double (nullable = true) -- rr: double (nullable = true) -- i: double (nullable = true) -- fr: string (nullable = true) -- reg: string (nullable = true) -- it: integer (nullable = true)
df.columns
Out[64]: ['_c0', 'province', 'specific', 'general', 'year', 'gdp', 'fdi', 'rnr', 'rr', 'i', 'fr', 'reg', 'it']
df.describe()
Out[65]: DataFrame[summary: string, _c0: string, province: string, specific: string, general: string, year: string, gdp: string, fdi: string, rnr: string, rr: string, i: string, fr: string, reg: string, it: string]

Setting Data Schema and Data Types

from pyspark.sql.types import StructField,StringType,IntegerType,StructType
data_schema = [
StructField("_c0", IntegerType(), True)
,StructField("province", StringType(), True)
,StructField("specific", IntegerType(), True)
,StructField("general", IntegerType(), True)
,StructField("year", IntegerType(), True)
,StructField("gdp", IntegerType(), True)
,StructField("fdi", IntegerType(), True)
,StructField("rnr", IntegerType(), True)
,StructField("rr", IntegerType(), True)
,StructField("i", IntegerType(), True)
,StructField("fr", IntegerType(), True)
,StructField("reg", StringType(), True)
,StructField("it", IntegerType(), True)
]
final_struc = StructType(fields=data_schema)

Applying the Data Schema/Data Types while reading in a CSV

df = spark.read.format("CSV").schema(final_struc).load(file_location)
df.printSchema()
root -- _c0: integer (nullable = true) -- province: string (nullable = true) -- specific: integer (nullable = true) -- general: integer (nullable = true) -- year: integer (nullable = true) -- gdp: integer (nullable = true) -- fdi: integer (nullable = true) -- rnr: integer (nullable = true) -- rr: integer (nullable = true) -- i: integer (nullable = true) -- fr: integer (nullable = true) -- reg: string (nullable = true) -- it: integer (nullable = true)
df.show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+ null|province| null| null|null|null| null|null|null|null| null| reg| null| 0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| 9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| 10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| 16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+ only showing top 20 rows
df['fr']
Out[72]: Column<b'fr'>
type(df['fr'])
Out[73]: pyspark.sql.column.Column
df.select('fr')
Out[74]: DataFrame[fr: int]
type(df.select('fr'))
Out[75]: pyspark.sql.dataframe.DataFrame
df.select('fr').show()
+-------+ fr| +-------+ null| 1128873| 1356287| 1518236| 1646891| 1601508| 1672445| 1677840| 1896479| null| null| 3434548| 4468640| 634562| 634562| 938788| null| 1667114| 2093925| 2511249| +-------+ only showing top 20 rows
df.head(2)
Out[77]: [Row(_c0=None, province='province', specific=None, general=None, year=None, gdp=None, fdi=None, rnr=None, rr=None, i=None, fr=None, reg='reg', it=None), Row(_c0=0, province='Anhui', specific=None, general=None, year=1996, gdp=None, fdi=50661, rnr=None, rr=None, i=None, fr=1128873, reg='East China', it=631930)]
df.select(['reg','fr'])
Out[78]: DataFrame[reg: string, fr: int]

Using select with RDDs

df.select(['reg','fr']).show()
+-----------+-------+ reg| fr| +-----------+-------+ reg| null| East China|1128873| East China|1356287| East China|1518236| East China|1646891| East China|1601508| East China|1672445| East China|1677840| East China|1896479| East China| null| East China| null| East China|3434548| East China|4468640| North China| 634562| North China| 634562| North China| 938788| North China| null| North China|1667114| North China|2093925| North China|2511249| +-----------+-------+ only showing top 20 rows
df.withColumn('fiscal_revenue',df['fr']).show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+--------------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|fiscal_revenue| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+--------------+ null|province| null| null|null|null| null|null|null|null| null| reg| null| null| 0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 1128873| 1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 1356287| 2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 1518236| 3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 1646891| 4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 1601508| 5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 1672445| 6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 1677840| 7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 1896479| 8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| null| 9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| null| 10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 3434548| 11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 4468640| 12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 634562| 13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 634562| 14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 938788| 15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| null| 16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 1667114| 17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 2093925| 18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| 2511249| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+--------------+ only showing top 20 rows
df.show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+ null|province| null| null|null|null| null|null|null|null| null| reg| null| 0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| 9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| 10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| 16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+ only showing top 20 rows

Renaming Columns using withColumnRenamed

df.withColumnRenamed('fr','new_fiscal_revenue').show()
+----+--------+--------+-------+----+----+------+----+----+----+------------------+-----------+-------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i|new_fiscal_revenue| reg| it| +----+--------+--------+-------+----+----+------+----+----+----+------------------+-----------+-------+ null|province| null| null|null|null| null|null|null|null| null| reg| null| 0| Anhui| null| null|1996|null| 50661|null|null|null| 1128873| East China| 631930| 1| Anhui| null| null|1997|null| 43443|null|null|null| 1356287| East China| 657860| 2| Anhui| null| null|1998|null| 27673|null|null|null| 1518236| East China| 889463| 3| Anhui| null| null|1999|null| 26131|null|null|null| 1646891| East China|1227364| 4| Anhui| null| null|2000|null| 31847|null|null|null| 1601508| East China|1499110| 5| Anhui| null| null|2001|null| 33672|null|null|null| 1672445| East China|2165189| 6| Anhui| null| null|2002|null| 38375|null|null|null| 1677840| East China|2404936| 7| Anhui| null| null|2003|null| 36720|null|null|null| 1896479| East China|2815820| 8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| 9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| 10| Anhui| null| null|2006|null|139354|null|null|null| 3434548| East China|5167300| 11| Anhui| null| null|2007|null|299892|null|null|null| 4468640| East China|7040099| 12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| 16| Beijing| null| null|2000|null|168368|null|null|null| 1667114|North China| 757990| 17| Beijing| null| null|2001|null|176818|null|null|null| 2093925|North China|1194728| 18| Beijing| null| null|2002|null|172464|null|null|null| 2511249|North China|1078754| +----+--------+--------+-------+----+----+------+----+----+----+------------------+-----------+-------+ only showing top 20 rows

New Columns by Transforming extant Columns using withColumn

df.withColumn('double_fiscal_revenue',df['fr']*2).show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+---------------------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|double_fiscal_revenue| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+---------------------+ null|province| null| null|null|null| null|null|null|null| null| reg| null| null| 0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 2257746| 1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 2712574| 2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 3036472| 3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 3293782| 4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 3203016| 5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 3344890| 6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 3355680| 7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 3792958| 8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| null| 9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| null| 10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 6869096| 11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 8937280| 12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 1269124| 13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 1269124| 14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 1877576| 15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| null| 16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 3334228| 17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 4187850| 18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| 5022498| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+---------------------+ only showing top 20 rows
df.withColumn('add_fiscal_revenue',df['fr']+1).show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+------------------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|add_fiscal_revenue| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+------------------+ null|province| null| null|null|null| null|null|null|null| null| reg| null| null| 0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 1128874| 1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 1356288| 2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 1518237| 3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 1646892| 4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 1601509| 5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 1672446| 6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 1677841| 7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 1896480| 8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| null| 9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| null| 10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 3434549| 11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 4468641| 12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 634563| 13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 634563| 14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 938789| 15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| null| 16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 1667115| 17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 2093926| 18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| 2511250| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+------------------+ only showing top 20 rows
df.withColumn('half_fiscal_revenue',df['fr']/2).show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+-------------------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it|half_fiscal_revenue| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+-------------------+ null|province| null| null|null|null| null|null|null|null| null| reg| null| null| 0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 564436.5| 1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 678143.5| 2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 759118.0| 3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 823445.5| 4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 800754.0| 5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 836222.5| 6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 838920.0| 7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 948239.5| 8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| null| 9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| null| 10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 1717274.0| 11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 2234320.0| 12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 317281.0| 13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 317281.0| 14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 469394.0| 15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| null| 16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 833557.0| 17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 1046962.5| 18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| 1255624.5| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+-------------------+ only showing top 20 rows
df.withColumn('half_fr',df['fr']/2)
Out[86]: DataFrame[_c0: int, province: string, specific: int, general: int, year: int, gdp: int, fdi: int, rnr: int, rr: int, i: int, fr: int, reg: string, it: int, half_fr: double]

Spark SQL for SQL functionality using createOrReplaceTempView

df.createOrReplaceTempView("economic_data")
sql_results = spark.sql("SELECT * FROM economic_data")
sql_results
Out[89]: DataFrame[_c0: int, province: string, specific: int, general: int, year: int, gdp: int, fdi: int, rnr: int, rr: int, i: int, fr: int, reg: string, it: int]
sql_results.show()
+----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+ null|province| null| null|null|null| null|null|null|null| null| reg| null| 0| Anhui| null| null|1996|null| 50661|null|null|null|1128873| East China| 631930| 1| Anhui| null| null|1997|null| 43443|null|null|null|1356287| East China| 657860| 2| Anhui| null| null|1998|null| 27673|null|null|null|1518236| East China| 889463| 3| Anhui| null| null|1999|null| 26131|null|null|null|1646891| East China|1227364| 4| Anhui| null| null|2000|null| 31847|null|null|null|1601508| East China|1499110| 5| Anhui| null| null|2001|null| 33672|null|null|null|1672445| East China|2165189| 6| Anhui| null| null|2002|null| 38375|null|null|null|1677840| East China|2404936| 7| Anhui| null| null|2003|null| 36720|null|null|null|1896479| East China|2815820| 8| Anhui| null| null|2004|null| 54669|null|null|null| null| East China|3422176| 9| Anhui| null| null|2005|null| 69000|null|null|null| null| East China|3874846| 10| Anhui| null| null|2006|null|139354|null|null|null|3434548| East China|5167300| 11| Anhui| null| null|2007|null|299892|null|null|null|4468640| East China|7040099| 12| Beijing| null| null|1996|null|155290|null|null|null| 634562|North China| 508135| 13| Beijing| null| null|1997|null|159286|null|null|null| 634562|North China| 569283| 14| Beijing| null| null|1998|null|216800|null|null|null| 938788|North China| 695528| 15| Beijing| null| null|1999|null|197525|null|null|null| null|North China| 944047| 16| Beijing| null| null|2000|null|168368|null|null|null|1667114|North China| 757990| 17| Beijing| null| null|2001|null|176818|null|null|null|2093925|North China|1194728| 18| Beijing| null| null|2002|null|172464|null|null|null|2511249|North China|1078754| +----+--------+--------+-------+----+----+------+----+----+----+-------+-----------+-------+ only showing top 20 rows
spark.sql("SELECT * FROM economic_data WHERE fr=634562").show()
+---+--------+--------+-------+----+----+------+----+----+----+------+-----------+------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it| +---+--------+--------+-------+----+----+------+----+----+----+------+-----------+------+ 12| Beijing| null| null|1996|null|155290|null|null|null|634562|North China|508135| 13| Beijing| null| null|1997|null|159286|null|null|null|634562|North China|569283| +---+--------+--------+-------+----+----+------+----+----+----+------+-----------+------+