from pyspark.sql import SparkSession

# Load the panel data from a CSV file.
# `spark` was previously assumed to exist as an injected global; bind it
# explicitly. getOrCreate() returns the already-running session when there is
# one, so behaviour inside a managed notebook is unchanged.
spark = SparkSession.builder.getOrCreate()

file_location = "/FileStore/tables/df_panel_fix.csv"
# header=True: take column names from the first row.
# inferSchema=True: let Spark derive column types from the data rather than
# reading every column as a string.
df = (
    spark.read.format("CSV")
    .option("inferSchema", True)
    .option("header", True)
    .load(file_location)
)
# NOTE(review): `display` is not defined in this file — presumably the
# notebook environment's rendering helper; confirm before running elsewhere.
display(df.take(5))
_c0provincespecificgeneralyeargdpfdirnrrrifrregit
0Anhui147002.0null19962093.3506610.00.00.01128873East China631930
1Anhui151981.0null19972347.32434430.00.00.01356287East China657860
2Anhui174930.0null19982542.96276730.00.00.01518236East China889463
3Anhui285324.0null19992712.3426131nullnullnull1646891East China1227364
4Anhui195580.032100.020002902.09318470.00.00.01601508East China1499110

Filtering on values in a column

# Filter with a SQL expression string, then print the matching rows.
rows_below_10k = df.filter("specific<10000")
rows_below_10k.show()
+---+--------+--------+-------+----+-------+------+---+---+----+-------+----------+-------+ _c0|province|specific|general|year| gdp| fdi|rnr| rr| i| fr| reg| it| +---+--------+--------+-------+----+-------+------+---+---+----+-------+----------+-------+ 268|Shanghai| 8964.0| null|2000|4771.17|316014|0.0|0.0|0.44|2224124|East China|1212473| 269|Shanghai| 9834.0| null|2001|5210.12|429159|0.0|0.0|0.44|2947285|East China|1053917| +---+--------+--------+-------+----+-------+------+---+---+----+-------+----------+-------+
# Apply the SQL-string filter, then keep only the province column.
matching_rows = df.filter("specific<10000")
matching_rows.select("province").show()
+--------+ province| +--------+ Shanghai| Shanghai| +--------+
# Apply the SQL-string filter, then project two columns.
# select() accepts column names either as separate arguments or as one list.
matching_rows = df.filter("specific<10000")
matching_rows.select("province", "year").show()
+--------+----+ province|year| +--------+----+ Shanghai|2000| Shanghai|2001| +--------+----+
# Same filter as above, written as a Column comparison instead of a SQL string.
specific_below_10k = df["specific"] < 10000
df.filter(specific_below_10k).show()
+---+--------+--------+-------+----+-------+------+---+---+----+-------+----------+-------+ _c0|province|specific|general|year| gdp| fdi|rnr| rr| i| fr| reg| it| +---+--------+--------+-------+----+-------+------+---+---+----+-------+----------+-------+ 268|Shanghai| 8964.0| null|2000|4771.17|316014|0.0|0.0|0.44|2224124|East China|1212473| 269|Shanghai| 9834.0| null|2001|5210.12|429159|0.0|0.0|0.44|2947285|East China|1053917| +---+--------+--------+-------+----+-------+------+---+---+----+-------+----------+-------+

Filtering on values in 2+ columns

# Rows that satisfy BOTH conditions. Column conditions are combined with `&`;
# naming each comparison first avoids the parentheses that `&`'s higher
# precedence over `<` / `>` would otherwise require.
specific_below_55k = df["specific"] < 55000
gdp_above_200 = df["gdp"] > 200
df.filter(specific_below_55k & gdp_above_200).show()
+---+--------+--------+-------+----+--------+------+----+----+----+-------+-------------------+-------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it| +---+--------+--------+-------+----+--------+------+----+----+----+-------+-------------------+-------+ 98| Hainan| 54462.0| null|1998| 442.13| 71715|null|null|null| 236461|South Central China| 177748| 216| Ningxia| 32088.0| null|1996| 202.9| 2826|null|null|null| 90805| Northwest China| 178668| 217| Ningxia| 44267.0| null|1997| 224.59| 671|null|null|null| 102083| Northwest China| 195295| 268|Shanghai| 8964.0| null|2000| 4771.17|316014| 0.0| 0.0|0.44|2224124| East China|1212473| 269|Shanghai| 9834.0| null|2001| 5210.12|429159| 0.0| 0.0|0.44|2947285| East China|1053917| 270|Shanghai| 19985.0| null|2002| 5741.03|427229| 0.0| 0.0|0.44|3380397| East China|1572208| 271|Shanghai| 23547.0| null|2003| 6694.23|546849| 0.0|0.53| 0.0|4461153| East China|2031496| 272|Shanghai| 29943.0| null|2004| 8072.83|654100| 0.0|0.53| 0.0| null| East China|2703643| 273|Shanghai| 29943.0| null|2005| 9247.66|685000| 0.0|0.53| 0.0| null| East China|2140461| 274|Shanghai| 42928.0| null|2006|10572.24|710700| 0.0|0.53| 0.0|8175966| East China|2239987| 302| Tianjin| 39364.0| null|1998| 1374.6|211361|null|null|null| 540178| North China| 361723| 303| Tianjin| 45463.0| null|1999| 1500.95|176399| 0.0| 0.0| 0.0| 605662| North China| 422522| 304| Tianjin| 51821.0| null|2000| 1701.88|116601| 0.0| 0.0| 0.0| 757464| North China| 547120| 305| Tianjin| 35084.0| null|2001| 1919.09|213348| 0.0| 0.0| 0.0| 942763| North China| 688810| +---+--------+--------+-------+----+--------+------+----+----+----+-------+-------------------+-------+
# Rows that satisfy EITHER condition; Column conditions are OR-ed with `|`.
specific_below_55k = df["specific"] < 55000
gdp_above_20k = df["gdp"] > 20000
df.filter(specific_below_55k | gdp_above_20k).show()
+---+---------+---------+--------+----+--------+-------+--------------------+----+-----------+--------+-------------------+-------+ _c0| province| specific| general|year| gdp| fdi| rnr| rr| i| fr| reg| it| +---+---------+---------+--------+----+--------+-------+--------------------+----+-----------+--------+-------------------+-------+ 69|Guangdong|1491588.0| null|2005|22557.37|1236400|0.027027027000000002| 0.0| 0.0| null|South Central China|4327217| 70|Guangdong|1897575.0|498913.0|2006|26587.76|1451065|0.027027027000000002| 0.0| 0.0|16804703|South Central China|4559252| 71|Guangdong| 859482.0| 0.0|2007|31777.01|1712603|0.027027027000000002| 0.0| 0.0|27858007|South Central China|4947824| 98| Hainan| 54462.0| null|1998| 442.13| 71715| null|null| null| 236461|South Central China| 177748| 179| Jiangsu|1188989.0| 0.0|2007|21742.05|1743140| 0.0| 0.0|0.275862069|22377276| East China|3557071| 216| Ningxia| 32088.0| null|1996| 202.9| 2826| null|null| null| 90805| Northwest China| 178668| 217| Ningxia| 44267.0| null|1997| 224.59| 671| null|null| null| 102083| Northwest China| 195295| 228| Qinghai| 37976.0| null|1996| 184.17| 576| null|null| null| 73260| Northwest China| 218361| 262| Shandong|1204547.0|112137.0|2006|21900.19|1000069| 0.0| 0.0| 0.0|11673659| East China|5304833| 263| Shandong|2121243.0|581800.0|2007|25776.91|1101159| 0.0| 0.0| 0.0|16753980| East China|6357869| 268| Shanghai| 8964.0| null|2000| 4771.17| 316014| 0.0| 0.0| 0.44| 2224124| East China|1212473| 269| Shanghai| 9834.0| null|2001| 5210.12| 429159| 0.0| 0.0| 0.44| 2947285| East China|1053917| 270| Shanghai| 19985.0| null|2002| 5741.03| 427229| 0.0| 0.0| 0.44| 3380397| East China|1572208| 271| Shanghai| 23547.0| null|2003| 6694.23| 546849| 0.0|0.53| 0.0| 4461153| East China|2031496| 272| Shanghai| 29943.0| null|2004| 8072.83| 654100| 0.0|0.53| 0.0| null| East China|2703643| 273| Shanghai| 29943.0| null|2005| 9247.66| 685000| 0.0|0.53| 0.0| null| East China|2140461| 274| Shanghai| 42928.0| 
null|2006|10572.24| 710700| 0.0|0.53| 0.0| 8175966| East China|2239987| 302| Tianjin| 39364.0| null|1998| 1374.6| 211361| null|null| null| 540178| North China| 361723| 303| Tianjin| 45463.0| null|1999| 1500.95| 176399| 0.0| 0.0| 0.0| 605662| North China| 422522| 304| Tianjin| 51821.0| null|2000| 1701.88| 116601| 0.0| 0.0| 0.0| 757464| North China| 547120| +---+---------+---------+--------+----+--------+-------+--------------------+----+-----------+--------+-------------------+-------+ only showing top 20 rows
# Rows with a low `specific` value whose gdp is NOT above 20000.
# `~` negates a Column condition.
specific_below_55k = df["specific"] < 55000
gdp_above_20k = df["gdp"] > 20000
df.filter(specific_below_55k & ~gdp_above_20k).show()
+---+--------+--------+-------+----+--------+------+-----------+----+----+-------+-------------------+-------+ _c0|province|specific|general|year| gdp| fdi| rnr| rr| i| fr| reg| it| +---+--------+--------+-------+----+--------+------+-----------+----+----+-------+-------------------+-------+ 98| Hainan| 54462.0| null|1998| 442.13| 71715| null|null|null| 236461|South Central China| 177748| 216| Ningxia| 32088.0| null|1996| 202.9| 2826| null|null|null| 90805| Northwest China| 178668| 217| Ningxia| 44267.0| null|1997| 224.59| 671| null|null|null| 102083| Northwest China| 195295| 228| Qinghai| 37976.0| null|1996| 184.17| 576| null|null|null| 73260| Northwest China| 218361| 268|Shanghai| 8964.0| null|2000| 4771.17|316014| 0.0| 0.0|0.44|2224124| East China|1212473| 269|Shanghai| 9834.0| null|2001| 5210.12|429159| 0.0| 0.0|0.44|2947285| East China|1053917| 270|Shanghai| 19985.0| null|2002| 5741.03|427229| 0.0| 0.0|0.44|3380397| East China|1572208| 271|Shanghai| 23547.0| null|2003| 6694.23|546849| 0.0|0.53| 0.0|4461153| East China|2031496| 272|Shanghai| 29943.0| null|2004| 8072.83|654100| 0.0|0.53| 0.0| null| East China|2703643| 273|Shanghai| 29943.0| null|2005| 9247.66|685000| 0.0|0.53| 0.0| null| East China|2140461| 274|Shanghai| 42928.0| null|2006|10572.24|710700| 0.0|0.53| 0.0|8175966| East China|2239987| 302| Tianjin| 39364.0| null|1998| 1374.6|211361| null|null|null| 540178| North China| 361723| 303| Tianjin| 45463.0| null|1999| 1500.95|176399| 0.0| 0.0| 0.0| 605662| North China| 422522| 304| Tianjin| 51821.0| null|2000| 1701.88|116601| 0.0| 0.0| 0.0| 757464| North China| 547120| 305| Tianjin| 35084.0| null|2001| 1919.09|213348| 0.0| 0.0| 0.0| 942763| North China| 688810| 312| Tibet| 18829.0| null|1996| 64.98| 679|0.181818182| 0.0| 0.0| 27801| Southwest China| 306114| 313| Tibet| 25185.0| null|1997| 77.24| 63|0.181818182| 0.0| 0.0| 33787| Southwest China| 346368| 314| Tibet| 48197.0| null|1998| 91.5| 481| 0.0|0.24| 0.0| 3810| Southwest China| 415547| 
+---+--------+--------+-------+----+--------+------+-----------+----+----+-------+-------------------+-------+
# Exact-match filter on a numeric column.
specific_is_8964 = df["specific"] == 8964.0
df.filter(specific_is_8964).show()
+---+--------+--------+-------+----+-------+------+---+---+----+-------+----------+-------+ _c0|province|specific|general|year| gdp| fdi|rnr| rr| i| fr| reg| it| +---+--------+--------+-------+----+-------+------+---+---+----+-------+----------+-------+ 268|Shanghai| 8964.0| null|2000|4771.17|316014|0.0|0.0|0.44|2224124|East China|1212473| +---+--------+--------+-------+----+-------+------+---+---+----+-------+----------+-------+
# Exact-match filter on a string column.
is_zhejiang = df["province"] == "Zhejiang"
df.filter(is_zhejiang).show()
+---+--------+---------+--------+----+--------+-------+-----------+-----------+-----------+--------+----------+-------+ _c0|province| specific| general|year| gdp| fdi| rnr| rr| i| fr| reg| it| +---+--------+---------+--------+----+--------+-------+-----------+-----------+-----------+--------+----------+-------+ 348|Zhejiang| 273253.0| null|1996| 4188.53| 152021| 0.0| 0.0| 0.0| 1291252|East China| 740327| 349|Zhejiang| 330558.0| null|1997| 4686.11| 150345| 0.0| 0.0| 0.0| 1432453|East China| 814253| 350|Zhejiang| 426756.0| null|1998| 5052.62| 131802| 0.0| 0.0| 0.0| 1761084|East China| 923455| 351|Zhejiang| 586457.0| null|1999| 5443.92| 123262| 0.0| 0.0| 0.0| 2146200|East China|1001703| 352|Zhejiang| 408151.0| null|2000| 6141.03| 161266| 0.0| 0.0| 0.0| 2955508|East China|1135215| 353|Zhejiang| 358714.0| null|2001| 6898.34| 221162| 0.0| 0.0| 0.0| 4436868|East China|1203372| 354|Zhejiang| 365437.0|321686.0|2002| 8003.67| 307610| 0.0| 0.0| 0.0| 4958329|East China|1962633| 355|Zhejiang| 391292.0|260313.0|2003| 9705.02| 498055|1.214285714|0.035714286|0.035714286| 6217715|East China|2261631| 356|Zhejiang| 656175.0|276652.0|2004| 11648.7| 668128|1.214285714|0.035714286|0.035714286| null|East China|3162299| 357|Zhejiang| 656175.0| null|2005|13417.68| 772000|1.214285714|0.035714286|0.035714286| null|East China|2370200| 358|Zhejiang|1017303.0|394795.0|2006|15718.47| 888935|1.214285714|0.035714286|0.035714286|11537149|East China|2553268| 359|Zhejiang| 844647.0| 0.0|2007|18753.73|1036576|0.047619048| 0.0| 0.0|16494981|East China|2939778| +---+--------+---------+--------+----+--------+-------+-----------+-----------+-----------+--------+----------+-------+
# collect() returns the matching rows as a Python list of Row objects
# instead of printing a table.
specific_is_8964 = df["specific"] == 8964.0
df.filter(specific_is_8964).collect()
Out[15]: [Row(_c0=268, province='Shanghai', specific=8964.0, general=None, year=2000, gdp=4771.17, fdi=316014, rnr=0.0, rr=0.0, i=0.44, fr='2224124', reg='East China', it=1212473)]
# Keep the collected rows in `result` and inspect the type of the first one.
specific_is_8964 = df["specific"] == 8964.0
result = df.filter(specific_is_8964).collect()
type(result[0])
Out[17]: pyspark.sql.types.Row
# Take the first collected Row and view it as a plain dict keyed by column name.
row = result[0]
row.asDict()
Out[19]: {'_c0': 268, 'province': 'Shanghai', 'specific': 8964.0, 'general': None, 'year': 2000, 'gdp': 4771.17, 'fdi': 316014, 'rnr': 0.0, 'rr': 0.0, 'i': 0.44, 'fr': '2224124', 'reg': 'East China', 'it': 1212473}
# Iterating a Row yields its field values in column order; print one per line.
for field_value in result[0]:
    print(field_value)
268 Shanghai 8964.0 None 2000 4771.17 316014 0.0 0.0 0.44 2224124 East China 1212473