With Recursive
The Postgresql documentation has various code examples.
The basic structure of a recursive query is
WITH RECURSIVE recursive_table(init, next, ...) AS (
SELECT init, next, ...
FROM graph_table
UNION
SELECT recursive_table.init, graph_table.next, ...
FROM graph_table JOIN recursive_table ON recursive_table.next = graph_table.init
)
SELECT ...
FROM recursive_table
WHERE init = 'Start'
AND next = 'End';
The base query can’t reference mytable while the recursive query joins mytable and graph_table to make a transitive closure.
These examples are taken from edX’s Databases: OLAP and Recursion free online course.
I put a Prolog version swish.
The Stanford course makes a digression into how nice it would be if the recursive query could be if the recursive query could be written from mytable R1, mytable R2
, but trying that causes Postgresql to barf:
ERROR: recursive reference to query "mytable" must not appear more than once
Family Tree example
create table ParentOf(parent text, child text);
insert into ParentOf values ('Alice', 'Carol');
insert into ParentOf values ('Bob', 'Carol');
insert into ParentOf values ('Carol', 'Dave');
insert into ParentOf values ('Carol', 'George');
insert into ParentOf values ('Dave', 'Mary');
insert into ParentOf values ('Eve', 'Mary');
insert into ParentOf values ('Mary', 'Frank');
I’ve rewritten the provided example to explicitly use a join between the recursive table’s child and the “normal” table’s parent.
This goes “up” the tree to ancestors (Widden used alias a for parent and alias d for child, but I decluttered by simply using the provided column names).
WITH RECURSIVE ancestor(parent, child) AS (
SELECT parent, child
FROM ParentOf
UNION
SELECT Ancestor.parent, ParentOf.child
FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
)
SELECT parent
FROM ancestor
WHERE child = 'Frank';
And this goes “down” the tree:
WITH RECURSIVE ancestor(parent, child) AS (
SELECT parent, child
FROM ParentOf
UNION
SELECT Ancestor.parent, ParentOf.child
FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
)
SELECT child
FROM ancestor
WHERE parent = 'Alice';
Paths
Typically we want to get a route. If we know the “start” and “end” nodes, we can get the path like this:
WITH RECURSIVE ancestor(parent, child, path) AS (
SELECT parent, child, ARRAY[parent, child]
FROM ParentOf
UNION
SELECT Ancestor.parent, ParentOf.child, path || ParentOf.child
FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
)
SELECT path
FROM ancestor
WHERE parent = 'Alice'
AND child = 'Frank';
Though there aren’t any cycles in this family tree, once we have a path, it’s easy to add cycle detection to the basic template:
WITH RECURSIVE ancestor(parent, child, path) AS (
SELECT parent, child, ARRAY[parent, child]
FROM ParentOf
UNION
SELECT Ancestor.parent, ParentOf.child, path || ParentOf.child
FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
WHERE NOT ParentOf.child = ANY (path)
)
SELECT path
FROM ancestor
WHERE parent = 'Alice'
AND child = 'Frank';
If we have a fixed root (ie “top”) node, we can hardwire it by moving WHERE parent = 'Alice'
to the base case:
WITH RECURSIVE ancestor(parent, child, path) AS (
SELECT parent, child, ARRAY[parent, child]
FROM ParentOf
WHERE parent = 'Alice'
UNION
SELECT Ancestor.parent, ParentOf.child, path || ParentOf.child
FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
)
SELECT path FROM ancestor
WHERE child = 'Frank';
Doing the same for a leaf (ie “bottom”) node into the base case involves reversing the direction of the transitive closure, and in turn the path order:
WITH RECURSIVE ancestor(child, parent, path) AS (
SELECT child, parent, ARRAY[child, parent]
FROM ParentOf
WHERE child = 'Frank'
UNION
SELECT Ancestor.child, ParentOf.parent, path || ParentOf.parent
FROM Ancestor JOIN ParentOf ON Ancestor.parent = ParentOf.child
)
SELECT path FROM ancestor
WHERE parent = 'Alice';
Or the starting “bottom” node can be provided outside the “with recursive” table:
WITH RECURSIVE ancestor(child, parent, path) AS (
SELECT child, parent, ARRAY[child, parent]
FROM ParentOf
UNION
SELECT Ancestor.child, ParentOf.parent, path || ParentOf.parent
FROM Ancestor JOIN ParentOf ON Ancestor.parent = ParentOf.child
)
SELECT path FROM ancestor
WHERE child = 'Frank'
AND parent = 'Alice';
Hierarchy
create table Employee(ID int, salary int);
create table Manager(mID int, eID int);
create table Project(name text, mgrID int);
insert into Employee values (123, 100);
insert into Employee values (234, 90);
insert into Employee values (345, 80);
insert into Employee values (456, 70);
insert into Employee values (567, 60);
insert into Manager values (123, 234);
insert into Manager values (234, 345);
insert into Manager values (234, 456);
insert into Manager values (345, 567);
insert into Project values ('X', 123);
insert into Project values ('Y', 234);
insert into Project values ('Z', 456);
Find total salary cost of project ‘X’
Note the recursive call does not include the starting node (top project manager in this example) so needs to be added.
WITH RECURSIVE superior(mID, eID) AS (
SELECT mID, eID
FROM Manager
UNION
SELECT superior.mID, Manager.eID
FROM superior JOIN manager ON superior.eID = manager.mID
)
SELECT sum(salary) FROM (
SELECT ID, salary FROM Employee WHERE ID = (SELECT mgrID FROM Project WHERE name = 'X')
UNION
SELECT eID AS ID, salary
FROM superior JOIN Employee ON Employee.ID = superior.eID
WHERE mID = (SELECT mgrID FROM Project WHERE name = 'X')
);
Find total salary cost of project ‘Y’ and ‘Z’
WITH RECURSIVE superior(mID, eID) AS (
SELECT mID, eID
FROM Manager
UNION
SELECT superior.mID, Manager.eID
FROM superior JOIN manager ON superior.eID = manager.mID
)
SELECT 'Y-total' AS project_name, sum(salary) FROM (
SELECT ID, salary FROM Employee WHERE ID = (SELECT mgrID FROM Project WHERE name = 'Y')
UNION
SELECT eID AS ID, salary
FROM superior JOIN Employee ON Employee.ID = superior.eID
WHERE mID = (SELECT mgrID FROM Project WHERE name = 'Y')
)
UNION
SELECT 'Z-total' AS project_name, sum(salary) FROM (
SELECT ID, salary FROM Employee WHERE ID = (SELECT mgrID FROM Project WHERE name = 'Z')
UNION
SELECT eID AS ID, salary
FROM superior JOIN Employee ON Employee.ID = superior.eID
WHERE mID = (SELECT mgrID FROM Project WHERE name = 'Z')
);
Airline flights
create table Flight(orig text, dest text, airline text, cost int);
insert into Flight values ('A', 'ORD', 'United', 200);
insert into Flight values ('ORD', 'B', 'American', 100);
insert into Flight values ('A', 'PHX', 'Southwest', 25);
insert into Flight values ('PHX', 'LAS', 'Southwest', 30);
insert into Flight values ('LAS', 'CMH', 'Frontier', 60);
insert into Flight values ('CMH', 'B', 'Frontier', 60);
insert into Flight values ('A', 'B', 'JetBlue', 195);
First find all costs
WITH RECURSIVE route(orig, dest, total, path) AS (
SELECT orig, dest, cost, ARRAY[orig, dest]
FROM Flight
UNION
SELECT route.orig, Flight.dest, total + cost, path || Flight.dest
FROM Flight JOIN route ON route.dest = flight.orig
)
SELECT path, total
FROM route
WHERE orig = 'A'
AND dest = 'B'
ORDER BY total;
Guarding against cycles
insert into Flight values ('CMH', 'PHX', 'Frontier', 80);
Using Cycle Detection from the Postgresql documentation as a a template:
WITH RECURSIVE route(orig, dest, is_cycle, total, path) AS (
SELECT orig, dest, false, cost, ARRAY[orig, dest]
FROM Flight
UNION
SELECT route.orig, Flight.dest, Flight.dest = ANY(path), total + cost, path || Flight.dest
FROM Flight JOIN route ON route.dest = flight.orig
WHERE NOT is_cycle
)
SELECT path, total
FROM route
WHERE orig = 'A'
AND dest = 'B'
ORDER BY total;
Rather than add the is_cycle paramater to select, which I think the document uses so as to introduce Postgresql’s “built-in syntax to simplify cycle detection”, I find it easier to make this a WHERE subquery.
WITH RECURSIVE route(orig, dest, total, path) AS (
SELECT orig, dest, cost, ARRAY[orig, dest]
FROM Flight
UNION
SELECT route.orig, Flight.dest, total + cost, path || Flight.dest
FROM Flight JOIN route ON route.dest = flight.orig
WHERE NOT Flight.dest = ANY(path)
)
SELECT path, total
FROM route
WHERE orig = 'A'
AND dest = 'B'
ORDER BY total;