Thursday, March 15, 2012

Database Development - Data Omission 1


ALTER PROCEDURE dbo.usp_blg_vkj_0000002 AS
BEGIN

/* Here is an example of omitting data by linking two outer or inner joined tables
to each other instead of a root containing all possible keys for both tables.
This example also introduces a limitation of SQL as joining more than one table
on the same columns where one of the source record sets contains more than one record
per join criteria causes row duplication to occur. */

-------------------------------------------------------------------------
-- Create schema.
-------------------------------------------------------------------------
IF NOT EXISTS (SELECT * FROM sys.schemas WHERE name = N'BLOG2')
 EXEC sys.sp_executesql N'CREATE SCHEMA [BLOG2] AUTHORIZATION [dbo]'

-------------------------------------------------------------------------
-- Create example tables.
-------------------------------------------------------------------------
IF NOT OBJECT_ID('BLOG2.Entity') IS NULL DROP TABLE BLOG2.Entity
CREATE TABLE BLOG2.Entity(EntityID integer, EntityName varchar(50))

-- populate Entity with 3 records.
INSERT INTO BLOG2.Entity(EntityID, EntityName )
SELECT EntityID = 1, EntityName = 'Entity1'
UNION SELECT EntityID = 2, EntityName = 'Entity2'
UNION SELECT EntityID = 3, EntityName = 'Entity3'

-- Create EntityBase1 and populate it with 3 records.
IF NOT OBJECT_ID('BLOG2.EntityBase1') IS NULL DROP TABLE BLOG2.EntityBase1
CREATE TABLE BLOG2.EntityBase1 (EntityID integer, Period integer)
INSERT INTO BLOG2.EntityBase1 (EntityID,Period )
SELECT Entity = 1, Period = 201101
UNION SELECT Entity = 1, Period = 201102

-- Create EntityBase2 and populate it with 3 records.
IF NOT OBJECT_ID('BLOG2.EntityBase2') IS NULL DROP TABLE BLOG2.EntityBase2
CREATE TABLE BLOG2.EntityBase2 (EntityID integer, FromPeriod integer, ThroughPeriod integer)
INSERT INTO BLOG2.EntityBase2 (EntityID,FromPeriod, ThroughPeriod )
SELECT Entity = 1, FromPeriod = 201101, ThroughPeriod = 201103
UNION SELECT Entity = 2, FromPeriod = 201104, ThroughPeriod = 201107

-- Create the 1st table for monetary data.
IF NOT OBJECT_ID('BLOG2.EntityDetail1') IS NULL DROP TABLE EntityDetail1
CREATE TABLE BLOG2.EntityDetail1 (EntityID integer, period integer, Amount money)

-- insert 3 records
INSERT INTO BLOG2.EntityDetail1 (EntityID, period, Amount)
SELECT BaseTable.EntityID , BaseTable.period , BaseTable.Amount
FROM (
SELECT EntityID = 1, period = 201101, Amount = 50
UNION ALL SELECT EntityID = 1, period = 201101, Amount = 60
UNION ALL SELECT EntityID = 1, period = 201101, Amount = 70
) BaseTable

-- Create the 2nd table for monetary data.
IF NOT OBJECT_ID('BLOG2.EntityDetail2') IS NULL DROP TABLE BLOG2.EntityDetail2
CREATE TABLE BLOG2.EntityDetail2(EntityID integer, period integer, Amount money)

-- insert 3 monetary records
INSERT INTO BLOG2.EntityDetail2(EntityID, period, Amount)
SELECT BaseTable.EntityID , BaseTable.period , BaseTable.Amount
FROM (
SELECT EntityID = 1, period = 201101, Amount = 50
UNION ALL SELECT EntityID = 1, period = 201101, Amount = 40
UNION ALL SELECT EntityID = 1, period = 201102, Amount = 30
) BaseTable
-------------------------------------------------------------------------
-- End of DML
-------------------------------------------------------------------------

/*
SELECT * FROM BLOG2.EntityBase1 BaseTable

EntityID    Period
----------- -----------
1           201101
1           201102
(2 row(s) affected)

SELECT * FROM BLOG2.EntityDetail1 BaseTable

EntityID    period      Amount
----------- ----------- ---------------------
1           201101      50.00
1           201101      60.00
1           201101      70.00
(3 row(s) affected)


SELECT * FROM BLOG2.EntityDetail2 BaseTable

EntityID    period      Amount
----------- ----------- ---------------------
1           201101      50.00
1           201101      40.00
1           201102      30.00
(3 row(s) affected)

The next example will have a join causing omission of data. The join

        LEFT JOIN BLOG2.EntityDetail2 Join2 WITH (NOLOCK)
             ON Join1.EntityID = Join2.EntityID
            AND Join1.Period = Join2.Period

should actually be coded to link to the BaseTable as

      LEFT JOIN BLOG2.EntityDetail2 Join2 WITH (NOLOCK)
           ON BaseTable.EntityID = Join2.EntityID
           AND BaseTable.Period = Join2.Period

although that is not entirely correct as I have introduced an inherent SQL limitation in this example.

Here is our data.

--BLOG2.EntityBase1 BaseTable
EntityID Period
----------- -----------
1 201101
1 201102

--BLOG2.EntityDetail1 BaseTable
EntityID period Amount
----------- ----------- ---------------------
1 201101 50.00
1 201101 60.00
1 201101 70.00

--BLOG2.EntityDetail2 BaseTable
EntityID period Amount
----------- ----------- ---------------------
1 201101 50.00
1 201101 40.00
1 201102 30.00

Our EntityBase1 table only contains two records.
One for 201101 and one for 201102. Both for EntityId = 1
Our EntityDetail1 table contains 3 records for EntityId = 1. All three for period 201101.
Our EntityDetail2 table contains 3 records, 2 for 201101 and 1 for 201102.
In total we have 5 records for 201101 and 1 record for 201102 so in total we should have at most 6 records.
Due to invalid joins in the queries below we have 7 records returning however the 201102 data is not returned at all in the first query.
*/

SELECT
BaseTable.EntityID
, Period = BaseTable.Period
, Period1 =Join1.Period
, Join1Amount = Join1.Amount
, Period2 = Join2.Period
, Join2Amount = Join2.Amount
FROM BLOG2.EntityBase1 BaseTable WITH (NOLOCK)
LEFT JOIN BLOG2.EntityDetail1 Join1 WITH (NOLOCK)
                ON BaseTable.EntityID = Join1.EntityID
                AND BaseTable.Period = Join1.Period
LEFT JOIN BLOG2.EntityDetail2 Join2 WITH (NOLOCK)
             ON Join1.EntityID = Join2.EntityID
            AND Join1.Period = Join2.Period

/*
As you can see the values returned by this query do not reflect the actual data.

EntityID    Period      Period1     Join1Amount           Period2     Join2Amount
----------- ----------- ----------- --------------------- ----------- ---------------------
1           201101      201101      50.00                 201101      50.00
1           201101      201101      50.00                 201101      40.00
1           201101      201101      60.00                 201101      50.00
1           201101      201101      60.00                 201101      40.00
1           201101      201101      70.00                 201101      50.00
1           201101      201101      70.00                 201101      40.00
1           201102      NULL        NULL                  NULL        NULL

(7 row(s) affected)

Now lets try to query the data a bit differently. This example will fix one thing wrong with the
prior query but will leave one more thing to be fixed. In this example a base record set containing
all possible keys (EntityID,Period) for both tables is created. These values are then linked to the
tables containing the actual data. There is still a problem with this query. As discussed in the last
example both EntityDetail1 and EntityDetail2 need to link on the same columns (EntityID,Period) which causes row duplication when either join returns more than one record and has identical join criteria.
In the following query a fix to join2 has been implemented as a join on the BaseTable instead of join1.  Our 201102 data is now showing up however this is still not correct but let's see what the data is returned anyway. */

SELECT
  BaseTable.EntityID
, AllPossible.Period
, AmountBaseTable = Join1.Amount
, AmountJoin1 = Join2.Amount
FROM BLOG2.EntityBase1 BaseTable WITH (NOLOCK)
-- All possible
LEFT JOIN (
    SELECT EntityID , Period
    FROM BLOG2.EntityDetail1
    UNION -- Don't use UNION ALL as we want a distinct list to join on.
    SELECT EntityID , Period 
    FROM BLOG2.EntityDetail2
) AllPossible
   ON BaseTable.EntityID = AllPossible.EntityID
   AND BaseTable.Period = AllPossible.Period
-- Detail 1
LEFT JOIN BLOG2.EntityDetail1 Join1 WITH (NOLOCK)
   ON AllPossible.EntityID = Join1.EntityID
   AND AllPossible.Period = Join1.Period
-- Detail 2
LEFT JOIN BLOG2.EntityDetail2 Join2 WITH (NOLOCK)
   ON AllPossible.EntityID = Join2.EntityID
   AND AllPossible.Period = Join2.Period


/* As can be seen we still have row duplication issues.

EntityID    Period      AmountBaseTable       AmountJoin1
----------- ----------- --------------------- ---------------------
1           201101      50.00                 50.00
1           201101      50.00                 40.00
1           201101      60.00                 50.00
1           201101      60.00                 40.00
1           201101      70.00                 50.00
1           201101      70.00                 40.00
1           201102      NULL                  30.00
(7 row(s) affected)

In this example all possible records have been rolled up into 'AllPossible'
and the tables containing the monetary amounts have been formatted as one table
by using the UNION ALL operator. We need to make sure to use the UNION ALL operator as we don't want to lose any data. A flag is supplied to denote which table the
values come from.
*/

SELECT
   BaseTable.EntityID
 , AllPossible.Period
 , Join2.RecType
 , Join2.Amount
FROM BLOG2.EntityBase1 BaseTable WITH (NOLOCK)
 LEFT JOIN (
  SELECT
   EntityID
   , Period
  FROM BLOG2.EntityDetail1
  UNION
  SELECT
   EntityID
   , Period
  FROM BLOG2.EntityDetail2
 ) AllPossible
  ON BaseTable.EntityID = AllPossible.EntityID
  AND BaseTable.Period = AllPossible.Period

 LEFT JOIN (
  SELECT
   RecType = 1
   , *
  FROM BLOG2.EntityDetail1
  UNION ALL
  SELECT
   RecType = 2
   , *
  FROM BLOG2.EntityDetail2
 ) Join2
  ON AllPossible.EntityID = Join2.EntityID
  AND AllPossible.Period = Join2.Period

/*
We now have only 6 records returned and they all display the correct values.

EntityID    Period      RecType     Amount
----------- ----------- ----------- ---------------------
1           201101      1           50.00
1           201101      1           60.00
1           201101      1           70.00
1           201101      2           50.00
1           201101      2           40.00
1           201102      2           30.00
(6 row(s) affected)
Here is another version of the prior query. In this example we sum the values and display them side by side.
*/

SELECT
      BaseTable.EntityID
     , AllPossible.Period
     , AmountBaseTable = SUM(CASE WHEN Join2.RecType = 1 THEN Join2.Amount ELSE 0 END)
     , AmountJoin1 = SUM(CASE WHEN Join2.RecType = 2 THEN Join2.Amount ELSE 0 END)
FROM BLOG2.EntityBase1 BaseTable WITH (NOLOCK)
 LEFT JOIN (
  SELECT
   EntityID
   , Period
  FROM BLOG2.EntityDetail1
  UNION
  SELECT
   EntityID
   , Period
  FROM BLOG2.EntityDetail2
 ) AllPossible
  ON BaseTable.EntityID = AllPossible.EntityID
  AND BaseTable.Period = AllPossible.Period

 LEFT JOIN (
  SELECT
   RecType = 1
   , *
  FROM BLOG2.EntityDetail1
  UNION ALL
  SELECT
   RecType = 2
   , *
  FROM BLOG2.EntityDetail2
 ) Join2
  ON AllPossible.EntityID = Join2.EntityID
  AND AllPossible.Period = Join2.Period
GROUP BY BaseTable.EntityID , AllPossible.Period

/*
EntityID    Period      AmountBaseTable       AmountJoin1
----------- ----------- --------------------- ---------------------
1           201101      180.00                90.00
1           201102      0.00                  30.00

(2 row(s) affected)

Hopefully the examples I have provided will help the reader understand
how row duplication occurs in SQL statements and how to deal with
identical join criteria.  If both EntityDetail tables had unique record
constraints on (entityid,period) they could have have been linked to
AllPossible without causing row duplication.
*/

END
GO
-- Another Example.
-------------------------------------------------------------------------------
-- Here is an example of ommiting data by linked two outer or inner joined tables
-- to each other instead of a root containing all possible keys for both tables.
-------------------------------------------------------------------------------
SELECT  t1.InvestorID , t2.Period , t3.Period
FROM     dbo.InvestorBase t1 WITH (NOLOCK)
        LEFT JOIN dbo.InvestorDetail1 t2 WITH (NOLOCK) ON t1.InvestorID = t2.InvestorID 
        LEFT JOIN dbo.InvestorDetail2 t3 WITH (NOLOCK)  ON t2.InvestorID = t3.InvestorID 
                                                                                    AND t2.Period = t3.Period
                     

SELECT  t1.InvestorID , AllPossible.Period, Amount1 = t2.Amount, Amount2 = t3.Amount
FROM     dbo.InvestorBase t1 WITH (NOLOCK)
            LEFT JOIN (
                  SELECT InvestorID , Period  FROM dbo.InvestorDetail1
                  UNION
                  SELECT InvestorID , Period  FROM dbo.InvestorDetail2
            ) AllPossible
                  ON t1.InvestorID  = AllPossible.InvestorID 
                  AND t1.Period = AllPossible.Period

            LEFT JOIN dbo.InvestorDetail1 t2 WITH (NOLOCK) ON AllPossible.InvestorID = t2.InvestorID 
                                                                                    AND AllPossible.Period = t2.Period
            LEFT JOIN dbo.InvestorDetail2 t3 WITH (NOLOCK)  ON AllPossible.InvestorID = t3.InvestorID 
                                                                                    AND AllPossible.Period = t3.Period

No comments:

Post a Comment