Viewing Data
Pandas:
df.head()
df.info()
df.describe()
PySpark:
df_csv.show()
df_csv.printSchema()
df_csv.describe().show()
Viewing data is an essential part of data analysis and manipulation. Both Pandas and PySpark provide various methods for inspecting DataFrames. Below is a detailed comparison of these methods, including options such as the vertical view in PySpark's show() method.
Viewing Data in Pandas
1. head(): View the first few rows
import pandas as pd
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35]
}
pdf = pd.DataFrame(data)
# View the first 5 rows
print(pdf.head())
# View the first n rows
print(pdf.head(10))
2. tail(): View the last few rows
# View the last 5 rows
print(pdf.tail())
# View the last n rows
print(pdf.tail(10))
3. info(): Summary of the DataFrame
# Summary of DataFrame (info() prints directly and returns None, so no print() wrapper is needed)
pdf.info()
4. describe(): Summary statistics
# Summary statistics of DataFrame
print(pdf.describe())
5. sample(): Random sample of rows
# Random sample of 2 rows (n cannot exceed the row count unless replace=True)
print(pdf.sample(2))
# Random sample of n rows with replacement
print(pdf.sample(n=5, replace=True))
6. iloc[]: Access rows by integer-location based indexing
# Access rows by index
print(pdf.iloc[0:5]) # First 5 rows
7. loc[]: Access rows by label
# Access rows by label (loc slicing is label-based and inclusive of the end label)
print(pdf.loc[0:2]) # Rows with labels 0 through 2
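The difference between iloc[] and loc[] is easiest to see with a non-default index. The short sketch below is illustrative only; the labeled DataFrame is a hypothetical variant of pdf with Name used as the index.
labeled = pdf.set_index('Name')
print(labeled.iloc[0:2])           # first two rows by position (end exclusive)
print(labeled.loc['Alice':'Bob'])  # rows selected by label (end inclusive)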
Viewing Data in PySpark
1. show(): View the first few rows
from pyspark.sql import SparkSession
# Initialize Spark session
spark = SparkSession.builder.appName("example").getOrCreate()
data = [('Alice', 25), ('Bob', 30), ('Charlie', 35)]
columns = ['Name', 'Age']
sdf = spark.createDataFrame(data, columns)
# View the first 20 rows (default)
sdf.show()
# View the first n rows
sdf.show(10)
2. show() with truncate: Control column width
# Truncate long strings to 20 characters (default)
sdf.show(truncate=True)
# Do not truncate strings
sdf.show(truncate=False)
# Truncate strings to a specific length
sdf.show(truncate=5)
3. show() with vertical: Vertical display of rows
# Vertical display of rows
sdf.show(vertical=True)
4. printSchema(): Print the schema of the DataFrame
# Print the schema
sdf.printSchema()
5. describe(): Summary statistics
# Summary statistics of DataFrame
sdf.describe().show()
6. head(): Retrieve the first row or n rows
# Retrieve the first row
print(sdf.head())
# Retrieve the first n rows
print(sdf.head(5))
7. take(): Retrieve the first n rows
# Retrieve the first n rows as a list of Row objects
print(sdf.take(5))
8. collect(): Retrieve all rows
# Retrieve all rows as a list of Row objects (avoid on large DataFrames)
all_rows = sdf.collect()
for row in all_rows:
    print(row)
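collect() brings every row back to the driver, which can exhaust memory on large DataFrames. The sketch below shows two lighter-weight alternatives for quick inspection; it reuses the sdf DataFrame from above and is only one possible approach.
# bring only a handful of rows to the driver instead of everything
first_rows = sdf.take(5)             # small list of Row objects
print(first_rows)
# or restrict the rows in Spark first, then convert the small slice to Pandas
small_pdf = sdf.limit(5).toPandas()
print(small_pdf)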
- Pandas:
head(), tail(): View the first and last few rows.
info(), describe(): Summary information and statistics.
sample(): Random sampling of rows.
iloc[], loc[]: Row access by position or by label.
- PySpark:
show(): Flexible method for displaying rows, with options like truncate and vertical.
printSchema(): Print the schema.
describe(): Summary statistics.
head(), take(): Retrieve a specific number of rows.
collect(): Retrieve all rows.
A side-by-side sketch of the roughly equivalent calls follows.
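For quick reference, here is a minimal side-by-side sketch of the roughly equivalent calls, reusing the pdf and sdf DataFrames created earlier in this section:
print(pdf.head())      # Pandas: first 5 rows
sdf.show(5)            # PySpark: first 5 rows
pdf.info()             # Pandas: column dtypes and non-null counts
sdf.printSchema()      # PySpark: column names and types
print(pdf.describe())  # Pandas: summary statistics
sdf.describe().show()  # PySpark: summary statistics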
Correlation Analysis
Pandas
import pandas as pd
# create a sample dataframe
data = {'A': [1, 2, 3, 4, 5],
        'B': [2, 3, 5, 7, 11]}
df = pd.DataFrame(data)
# calculate correlation
corr = df.corr()
print(corr)
PySpark
from pyspark.sql import SparkSession
# create a spark session
spark = SparkSession.builder.appName('Correlation Analysis').getOrCreate()
# create a sample dataframe
data = [(1, 2), (2, 3), (3, 5), (4, 7), (5, 11)]
df = spark.createDataFrame(data, ['A', 'B'])
# calculate correlation
corr = df.stat.corr('A', 'B')
print(corr)
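df.stat.corr() returns a single pairwise coefficient. If a full correlation matrix across several numeric columns is needed, one option (not the only one) is pyspark.ml.stat.Correlation together with VectorAssembler; the sketch below assumes the same df as above.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
# assemble the numeric columns into a single vector column
assembler = VectorAssembler(inputCols=['A', 'B'], outputCol='features')
vector_df = assembler.transform(df).select('features')
# compute the Pearson correlation matrix (a DenseMatrix in a single-row result)
matrix = Correlation.corr(vector_df, 'features').head()[0]
print(matrix.toArray())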
Data Visualization
Pandas
import pandas as pd
import matplotlib.pyplot as plt
# create a sample dataframe
data = {'A': [1, 2, 3, 4, 5],
        'B': [2, 3, 5, 7, 11]}
df = pd.DataFrame(data)
# plot a line chart
plt.figure(figsize=(10, 6))
plt.plot(df['A'], df['B'], marker='o')
plt.title('Line Chart')
plt.xlabel('A')
plt.ylabel('B')
plt.show()
PySpark
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
# create a spark session
spark = SparkSession.builder.appName('Data Visualization').getOrCreate()
# create a sample dataframe
data = [(1, 2), (2, 3), (3, 5), (4, 7), (5, 11)]
df = spark.createDataFrame(data, ['A', 'B'])
# convert to pandas dataframe
pdf = df.toPandas()
# plot a line chart
plt.figure(figsize=(10, 6))
plt.plot(pdf['A'], pdf['B'], marker='o')
plt.title('Line Chart')
plt.xlabel('A')
plt.ylabel('B')
plt.show()
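toPandas() collects the whole DataFrame to the driver, so for large datasets it is common to sample or aggregate in Spark first and plot only the reduced result. A minimal sketch, assuming the same df and matplotlib import as above:
# sample a fraction of rows before converting
sample_pdf = df.sample(fraction=0.5, seed=42).toPandas()
# or aggregate first, e.g. the mean of B per value of A
agg_pdf = df.groupBy('A').agg({'B': 'mean'}).sort('A').toPandas()
plt.plot(agg_pdf['A'], agg_pdf['avg(B)'], marker='o')
plt.title('Mean of B per A (aggregated in Spark)')
plt.show()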
Data Analysis and DQ Operations
Pandas
import pandas as pd
# create a sample dataframe
data = {'A': [1, 2, 3, 4, 5],
        'B': [2, 3, 5, 7, 11]}
df = pd.DataFrame(data)
# calculate mean
mean = df['A'].mean()
print(mean)
# calculate median
median = df['A'].median()
print(median)
# calculate standard deviation
std_dev = df['A'].std()
print(std_dev)
# calculate variance
variance = df['A'].var()
print(variance)
# drop duplicates
df.drop_duplicates(inplace=True)
# handle missing values
df.fillna(0, inplace=True)
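Before dropping duplicates or filling missing values, it usually helps to measure how many there are. A brief sketch of common data-quality checks in Pandas, on the same df (the range rule is purely illustrative):
# count missing values per column
print(df.isna().sum())
# count fully duplicated rows
print(df.duplicated().sum())
# flag rows where A falls outside an expected range (example rule, thresholds are illustrative)
print(df[(df['A'] < 0) | (df['A'] > 100)])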
PySpark
from pyspark.sql import SparkSession
# create a spark session
spark = SparkSession.builder.appName('Data Analysis and DQ').getOrCreate()
# create a sample dataframe
data = [(1, 2), (2, 3), (3, 5), (4, 7), (5, 11)]
df = spark.createDataFrame(data, ['A', 'B'])
# calculate mean
mean = df.agg({'A': 'mean'}).collect()[0][0]
print(mean)
# calculate median
median = df.approxQuantile('A', [0.5], 0)[0]
print(median)
# calculate standard deviation
std_dev = df.agg({'A': 'stddev'}).collect()[0][0]
print(std_dev)
# calculate variance
variance = df.agg({'A': 'variance'}).collect()[0][0]
print(variance)
# drop duplicates (Spark transformations return a new DataFrame, so reassign the result)
df = df.dropDuplicates()
# handle missing values
df = df.fillna(0)
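The same statistics can also be computed in one pass with pyspark.sql.functions, avoiding a separate collect() per metric; counting nulls per column is a common companion check. A minimal sketch on the same df (the alias names are only for readability):
from pyspark.sql import functions as F
# several statistics for column A in a single aggregation
stats = df.agg(
    F.mean('A').alias('mean'),
    F.stddev('A').alias('stddev'),
    F.variance('A').alias('variance')
).collect()[0]
print(stats['mean'], stats['stddev'], stats['variance'])
# count nulls per column
null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns])
null_counts.show()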