Viewing Data in PySpark
1. show(): View the first few rows
# Build a small example DataFrame to demonstrate the viewing methods below.
from pyspark.sql import SparkSession
# Initialize Spark session (getOrCreate reuses a session if one already exists)
spark = SparkSession.builder.appName("example").getOrCreate()
data = [('Alice', 25), ('Bob', 30), ('Charlie', 35)]
columns = ['Name', 'Age']
sdf = spark.createDataFrame(data, columns)
# View the first 20 rows (default) in a tabular, console-friendly format
sdf.show()
# View the first n rows (here n=10; more than exist, so all 3 are shown)
sdf.show(10)
2. show() with truncate: Control column width
# truncate=True (the default): string cells longer than 20 characters are cut off
sdf.show(truncate=True)
# truncate=False: print full cell contents with no cutting
sdf.show(truncate=False)
# truncate=<int>: cut string cells to that many characters (here 5)
sdf.show(truncate=5)
3. show() with vertical: Vertical display of rows
# vertical=True prints one "column: value" block per row — handy for wide tables
sdf.show(vertical=True)
4. printSchema(): Print the schema of the DataFrame
# Print the DataFrame's schema (column names, types, nullability) as a tree
sdf.printSchema()
5. describe(): Summary statistics
# describe() returns a DataFrame of summary statistics; chain .show() to display it
sdf.describe().show()
6. head(): Retrieve the first row or n rows
# With no argument, head() returns a single Row object
print(sdf.head())
# With n, head(n) returns a list of the first n Row objects
print(sdf.head(5))
7. take(): Retrieve the first n rows
# take(n) retrieves the first n rows as a list of Row objects on the driver
print(sdf.take(5))
8. collect(): Retrieve all rows
# collect() pulls every row to the driver as a list of Row objects.
# NOTE: only safe for small DataFrames — a large result can exhaust driver memory.
all_rows = sdf.collect()
for row in all_rows:
    print(row)  # fixed: loop body must be indented (was at column 0 → SyntaxError)
- PySpark summary:
  - show(): flexible row display, with options like truncate and vertical.
  - printSchema(): print the DataFrame schema.
  - describe(): summary statistics.
  - head(), take(): retrieve a specific number of rows.
  - collect(): retrieve all rows.
Leave a Reply