Combines results of two queries (removes duplicates)
SELECT col FROM table1 UNION SELECT col FROM table2;
UNION ALL
query1 UNION ALL query2
Combines results of two queries (includes duplicates)
SELECT col FROM table1 UNION ALL SELECT col FROM table2;
INTERSECT
query1 INTERSECT query2
Returns common rows from both queries
SELECT col FROM table1 INTERSECT SELECT col FROM table2;
EXCEPT
query1 EXCEPT query2
Returns rows from the first query not in the second
SELECT col FROM table1 EXCEPT SELECT col FROM table2;
7. Aggregate Functions (Operators)
Function
Syntax
Description
Example
COUNT
COUNT(*) or COUNT(col)
Counts rows or non-NULL values
SELECT COUNT(*) FROM table;
SUM
SUM(col)
Sums up numeric values
SELECT SUM(sales) FROM table;
AVG
AVG(col)
Calculates the average
SELECT AVG(price) FROM table;
MIN
MIN(col)
Finds the minimum value
SELECT MIN(price) FROM table;
MAX
MAX(col)
Finds the maximum value
SELECT MAX(price) FROM table;
GROUP BY
GROUP BY col
Groups rows based on a column value
SELECT category, SUM(sales) FROM table GROUP BY category;
8. Window Functions
Function
Syntax
Description
Example
ROW_NUMBER
ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2)
Assigns a unique number to each row within a partition
SELECT ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales) AS rank FROM table;
RANK
RANK() OVER (PARTITION BY col ORDER BY col2)
Assigns rank to rows with gaps for duplicates
SELECT RANK() OVER (PARTITION BY category ORDER BY sales DESC) AS rank FROM table;
DENSE_RANK
DENSE_RANK() OVER (PARTITION BY col ORDER BY col2)
Assigns rank to rows without gaps
SELECT DENSE_RANK() OVER (PARTITION BY category ORDER BY sales DESC) AS rank FROM table;
NTILE
NTILE(n) OVER (PARTITION BY col ORDER BY col2)
Divides rows into n buckets
SELECT NTILE(4) OVER (ORDER BY sales) AS quartile FROM table;
LEAD
LEAD(col, offset, default) OVER (PARTITION BY col ORDER BY col2)
Accesses the value of the next row
SELECT LEAD(sales, 1, 0) OVER (ORDER BY date) FROM table;
LAG
LAG(col, offset, default) OVER (PARTITION BY col ORDER BY col2)
Accesses the value of the previous row
SELECT LAG(sales, 1, 0) OVER (ORDER BY date) FROM table;
9. Miscellaneous Operators
Operator
Syntax
Description
Example
CAST
CAST(expr AS type)
Converts a value to a specified type
SELECT CAST(price AS STRING) FROM table;
CASE
CASE WHEN condition THEN result ELSE result END
Conditional logic
SELECT CASE WHEN sales > 100 THEN 'High' ELSE 'Low' END AS category FROM table;
DISTINCT
DISTINCT col
Returns unique values
SELECT DISTINCT category FROM table;
This cheatsheet provides a comprehensive overview of commonly used Spark SQL operators and functions with their syntax, descriptions, and examples. Use it as a reference to efficiently write and optimize Spark SQL queries!
Leave a Reply