With Recursive

The Postgresql documentation has various code examples.

The basic structure of a recursive query is

WITH RECURSIVE recursive_table(init, next, ...) AS (
  SELECT init, next, ... 
  FROM graph_table
    UNION
  SELECT recursive_table.init, graph_table.next, ... 
  FROM graph_table JOIN recursive_table ON recursive_table.next = graph_table.init 
)
SELECT ...
FROM recursive_table
WHERE init = 'Start'
AND next = 'End';

The base query can’t reference mytable while the recursive query joins mytable and graph_table to make a transitive closure.

These examples are taken from edX’s Databases: OLAP and Recursion free online course.

I put a Prolog version swish.

The Stanford course makes a digression into how nice it would be if the recursive query could be if the recursive query could be written from mytable R1, mytable R2, but trying that causes Postgresql to barf:

ERROR:  recursive reference to query "mytable" must not appear more than once

Family Tree example

create table ParentOf(parent text, child text);

insert into ParentOf values ('Alice', 'Carol');
insert into ParentOf values ('Bob', 'Carol');
insert into ParentOf values ('Carol', 'Dave');
insert into ParentOf values ('Carol', 'George');
insert into ParentOf values ('Dave', 'Mary');
insert into ParentOf values ('Eve', 'Mary');
insert into ParentOf values ('Mary', 'Frank');

I’ve rewritten the provided example to explicitly use a join between the recursive table’s child and the “normal” table’s parent.

This goes “up” the tree to ancestors (Widden used alias a for parent and alias d for child, but I decluttered by simply using the provided column names).

WITH RECURSIVE ancestor(parent, child) AS (
  SELECT parent, child
  FROM ParentOf
    UNION
  SELECT Ancestor.parent, ParentOf.child
  FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
)
SELECT parent
FROM ancestor
WHERE child = 'Frank';

And this goes “down” the tree:

WITH RECURSIVE ancestor(parent, child) AS (
  SELECT parent, child
  FROM ParentOf
    UNION
  SELECT Ancestor.parent, ParentOf.child
  FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
)
SELECT child
FROM ancestor
WHERE parent = 'Alice';

Paths

Typically we want to get a route. If we know the “start” and “end” nodes, we can get the path like this:

WITH RECURSIVE ancestor(parent, child, path) AS (
  SELECT parent, child, ARRAY[parent, child]
  FROM ParentOf
    UNION
  SELECT Ancestor.parent, ParentOf.child, path || ParentOf.child
  FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
)
SELECT path
FROM ancestor
WHERE parent = 'Alice'
AND child = 'Frank';

Though there aren’t any cycles in this family tree, once we have a path, it’s easy to add cycle detection to the basic template:

WITH RECURSIVE ancestor(parent, child, path) AS (
  SELECT parent, child, ARRAY[parent, child]
  FROM ParentOf
    UNION
  SELECT Ancestor.parent, ParentOf.child, path || ParentOf.child
  FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
  WHERE NOT ParentOf.child = ANY (path)
)
SELECT path
FROM ancestor
WHERE parent = 'Alice'
AND child = 'Frank';

If we have a fixed root (ie “top”) node, we can hardwire it by moving WHERE parent = 'Alice' to the base case:

WITH RECURSIVE ancestor(parent, child, path) AS (
  SELECT parent, child, ARRAY[parent, child]
  FROM ParentOf
  WHERE parent = 'Alice'
    UNION
  SELECT Ancestor.parent, ParentOf.child, path || ParentOf.child
  FROM Ancestor JOIN ParentOf ON Ancestor.child = ParentOf.parent
)
SELECT path FROM ancestor
WHERE child = 'Frank';

Doing the same for a leaf (ie “bottom”) node into the base case involves reversing the direction of the transitive closure, and in turn the path order:

WITH RECURSIVE ancestor(child, parent, path) AS (
  SELECT child, parent, ARRAY[child, parent]
  FROM ParentOf
  WHERE child = 'Frank'
    UNION
  SELECT Ancestor.child, ParentOf.parent, path || ParentOf.parent
  FROM Ancestor JOIN ParentOf ON Ancestor.parent = ParentOf.child
)
SELECT path FROM ancestor
WHERE parent = 'Alice';

Or the starting “bottom” node can be provided outside the “with recursive” table:

WITH RECURSIVE ancestor(child, parent, path) AS (
  SELECT child, parent, ARRAY[child, parent]
  FROM ParentOf
    UNION
  SELECT Ancestor.child, ParentOf.parent, path || ParentOf.parent
  FROM Ancestor JOIN ParentOf ON Ancestor.parent = ParentOf.child
)
SELECT path FROM ancestor
WHERE child = 'Frank'
AND parent = 'Alice';

Hierarchy

create table Employee(ID int, salary int);
create table Manager(mID int, eID int);
create table Project(name text, mgrID int);

insert into Employee values (123, 100);
insert into Employee values (234, 90);
insert into Employee values (345, 80);
insert into Employee values (456, 70);
insert into Employee values (567, 60);
insert into Manager values (123, 234);
insert into Manager values (234, 345);
insert into Manager values (234, 456);
insert into Manager values (345, 567);
insert into Project values ('X', 123);
insert into Project values ('Y', 234);
insert into Project values ('Z', 456);

Find total salary cost of project ‘X’

Note the recursive call does not include the starting node (top project manager in this example) so needs to be added.

WITH RECURSIVE superior(mID, eID) AS (
  SELECT mID, eID
  FROM Manager
    UNION
  SELECT superior.mID, Manager.eID
  FROM superior JOIN manager ON superior.eID = manager.mID
)
SELECT sum(salary) FROM (
  SELECT ID, salary FROM Employee WHERE ID = (SELECT mgrID FROM Project WHERE name = 'X')
    UNION
  SELECT eID AS ID, salary
  FROM superior JOIN Employee ON Employee.ID = superior.eID
  WHERE mID = (SELECT mgrID FROM Project WHERE name = 'X')
);

Find total salary cost of project ‘Y’ and ‘Z’

WITH RECURSIVE superior(mID, eID) AS (
  SELECT mID, eID
  FROM Manager
    UNION
  SELECT superior.mID, Manager.eID
  FROM superior JOIN manager ON superior.eID = manager.mID
)
SELECT 'Y-total' AS project_name, sum(salary) FROM (
  SELECT ID, salary FROM Employee WHERE ID = (SELECT mgrID FROM Project WHERE name = 'Y')
    UNION
  SELECT eID AS ID, salary
  FROM superior JOIN Employee ON Employee.ID = superior.eID
  WHERE mID = (SELECT mgrID FROM Project WHERE name = 'Y')
)
UNION
SELECT 'Z-total' AS project_name, sum(salary) FROM (
  SELECT ID, salary FROM Employee WHERE ID = (SELECT mgrID FROM Project WHERE name = 'Z')
    UNION
  SELECT eID AS ID, salary
  FROM superior JOIN Employee ON Employee.ID = superior.eID
  WHERE mID = (SELECT mgrID FROM Project WHERE name = 'Z')
);

Airline flights

create table Flight(orig text, dest text, airline text, cost int);

insert into Flight values ('A', 'ORD', 'United', 200);
insert into Flight values ('ORD', 'B', 'American', 100);
insert into Flight values ('A', 'PHX', 'Southwest', 25);
insert into Flight values ('PHX', 'LAS', 'Southwest', 30);
insert into Flight values ('LAS', 'CMH', 'Frontier', 60);
insert into Flight values ('CMH', 'B', 'Frontier', 60);
insert into Flight values ('A', 'B', 'JetBlue', 195);

First find all costs

WITH RECURSIVE route(orig, dest, total, path) AS (
  SELECT orig, dest, cost, ARRAY[orig, dest]
  FROM Flight 
  UNION
  SELECT route.orig, Flight.dest, total + cost, path || Flight.dest
  FROM Flight JOIN route ON route.dest = flight.orig 
)
SELECT path, total
FROM route
WHERE orig = 'A'
AND dest = 'B'
ORDER BY total;

Guarding against cycles

insert into Flight values ('CMH', 'PHX', 'Frontier', 80);

Using Cycle Detection from the Postgresql documentation as a a template:

WITH RECURSIVE route(orig, dest, is_cycle, total, path) AS (
  SELECT orig, dest, false, cost, ARRAY[orig, dest]
  FROM Flight 
  UNION
  SELECT route.orig, Flight.dest, Flight.dest = ANY(path), total + cost, path || Flight.dest
  FROM Flight JOIN route ON route.dest = flight.orig
  WHERE NOT is_cycle
)
SELECT path, total
FROM route
WHERE orig = 'A'
AND dest = 'B'
ORDER BY total;

Rather than add the is_cycle paramater to select, which I think the document uses so as to introduce Postgresql’s “built-in syntax to simplify cycle detection”, I find it easier to make this a WHERE subquery.

WITH RECURSIVE route(orig, dest, total, path) AS (
  SELECT orig, dest, cost, ARRAY[orig, dest]
  FROM Flight 
  UNION
  SELECT route.orig, Flight.dest, total + cost, path || Flight.dest
  FROM Flight JOIN route ON route.dest = flight.orig
  WHERE NOT Flight.dest = ANY(path)
)
SELECT path, total
FROM route
WHERE orig = 'A'
AND dest = 'B'
ORDER BY total;