Algorithm or SQL : to find where conditions for a set of columns which ensures result set has value in a particular column always > 0 Algorithm or SQL : to find where conditions for a set of columns which ensures result set has value in a particular column always > 0 oracle oracle

Algorithm or SQL : to find where conditions for a set of columns which ensures result set has value in a particular column always > 0


Below is a brute-force solution. This might also be a good question for the theoretical computer science site. I think this is an NP-complete problem similar to Boolean satisfiability, but that's just a wild guess. There may be a smarter way to solve this but I don't think I'll find it.

The basic idea is to cross join every expression with every distinct value for a column, and then cross join all the columns. The table will be queried with every expression list, and a count generated for positive and negative scores. If the expression returns the expected number of positive scores and none of the negative scores it is valid.

This assumes that you are only using the expressions >, <, and =. Every new column or expression will make this problem exponentially slower.

Test schema

drop table table1;create table table1(a number, b number, c number, d number, score number);insert into table1select  1,      40,      10,       3,     -20 from dual union allselect  0,      40,      2,        3,      10 from dual union allselect  0,      10,      3,        3,      20 from dual union allselect  1,      15,      3,        3,     -5  from dual union allselect  0,      10,      2,        2,     -15 from dual union allselect  0,      15,      6,        3,     -10 from dual;

Wall of code

declare    v_inline_view varchar2(32767);    v_inline_views clob;    v_inline_view_counter number := 0;    v_and_expression varchar2(32767);    v_query clob;    v_sqls sys.odcivarchar2list;    v_dynamic_query_counter number := 0;begin    --#1: Create inline views.    --One for every combination of expression and distinct value, per column.    for inline_views in    (        --Inline view for every possible expression for each column.        select            replace(q'[            (                select *                from                (                    --Possible expressions.                    select distinct                        case                            when operator is null then null                            else ' AND %%COLUMN%% '||operator||' '||%%COLUMN%%                        end %%COLUMN%%_expression                    from                    --All operators.                    (                        select '>'  operator from dual union all                        select '<'  operator from dual union all                        select '='  operator from dual union all                        select null operator from dual                    )                    --All distinct values.                    cross join                    (                        select distinct %%COLUMN%% from table1                    )                )                --where %%COLUMN%%_expression is null or %%COLUMN%%_expression %%EXPRESSION_PERFORMANCE_EXCLUSIONS%%            )            ]', '%%COLUMN%%', column_name) inline_view        from user_tab_columns        where table_name = 'TABLE1'            and column_name <> 'SCORE'        order by column_name    ) loop        --Assign to temorary so it can be modified.        v_inline_view := inline_views.inline_view;        --#1A: Optimize inline view - throw out expressions if they don't return any positive results.        declare            v_expressions sys.odcivarchar2list;            v_expressions_to_ignore varchar2(32767);            v_has_score_gt_0 number;        begin            --Gather expressions for one column.            execute immediate v_inline_view bulk collect into v_expressions;            --Loop through and test each expression.            for i in 1 .. v_expressions.count loop                --Always keep the null expression.                if v_expressions(i) is not null then                    --Count the number of rows with a positive score.                    execute immediate 'select nvl(max(case when score > 0 then 1 else 0 end), 0) from table1 where '||replace(v_expressions(i), ' AND ', null)                    into v_has_score_gt_0;                    --If the expression returns nothing positive then add it to exclusion.                    if v_has_score_gt_0 = 0 then                        v_expressions_to_ignore := v_expressions_to_ignore||','''||v_expressions(i)||'''';                    end if;                end if;            end loop;            --Convert it into an IN clause.            if v_expressions_to_ignore is not null then                --Remove comment, replace placeholder with expression exclusions.                v_inline_view := regexp_replace(v_inline_view, '(.*)(--where)( .* )(%%EXPRESSION_PERFORMANCE_EXCLUSIONS%%)(.*)', '\1where\3 not in ('||substr(v_expressions_to_ignore, 2)||')');            end if;        end;        --Aggregate and count inline views.        if v_inline_view_counter <> 0 then            v_inline_views := v_inline_views||'cross join';        end if;        v_inline_views := v_inline_views||v_inline_view;        v_inline_view_counter := v_inline_view_counter + 1;    end loop;    --#2: Create an AND expression to combine all column expressions.    select listagg(column_name||'_expression', '||') within group (order by column_name)    into v_and_expression    from user_tab_columns    where table_name = 'TABLE1'        and column_name <> 'SCORE';    --#3: Create a that will create all possible expression combinations.    v_query :=     replace(replace(q'[        --8281 combinations        select '            select                '''||expressions||''' expression,                nvl(sum(case when score > 0 then 1 else 0 end), 0) gt_0_score_count,                nvl(sum(case when score <= 0 then 1 else 0 end), 0) le_0_score_count            from table1            where 1=1 '||expressions v_sql        from        (            --Combine expressions            select %%AND_EXPRESSION%% expressions            from            %%INLINE_VIEWS%%        ) combined_expressions    ]', '%%AND_EXPRESSION%%', v_and_expression), '%%INLINE_VIEWS%%', v_inline_views);    --TEST: It might be useful to see the full query here.    --dbms_output.put_line(v_query);    --#4: Gather expressions.    --With larger input you'll want to use a LIMIT    execute immediate v_query    bulk collect into v_sqls;    --#5: Test each expression.      --Look for any queries that return the right number of rows.    for i in 1 .. v_sqls.count loop        declare            v_expression varchar2(4000);            v_gt_0_score_count number;            v_le_0_score_count number;        begin            execute immediate v_sqls(i) into v_expression, v_gt_0_score_count, v_le_0_score_count;            v_dynamic_query_counter := v_dynamic_query_counter + 1;            --TODO: Dynamically generate "2".            if v_gt_0_score_count = 2 and v_le_0_score_count = 0 then                dbms_output.put_line('Expression: '||v_expression);            end if;        exception when others then            dbms_output.put_line('Problem with: '||v_sqls(i));        end;    end loop;    dbms_output.put_line('Queries executed: '||v_dynamic_query_counter);end;/

Results

The results appear correct. They are slightly different than yours because "columnB > 10" is incorrect.

Expression:  AND A = 0 AND C < 6 AND D = 3Expression:  AND A = 0 AND C < 6 AND D > 2Expression:  AND A < 1 AND C < 6 AND D = 3Expression:  AND A < 1 AND C < 6 AND D > 2Queries executed: 441

Problems

This brute-force approach is extremely inefficient in at least two ways. Even for this simple example it requires 6370 queries.. And the results may include duplicates that are non-trivial to reduce. Or perhaps you'll get lucky and there are so few solutions that you can eyeball them.

There are a few things you can do to improve query performance. The easiest one would be to check every condition individually and throw it out if it does not "gain" anything for the counts.

Optimizations

Individual expressions that do not return any positive results are excluded. With the sample data, this reduces the number of query executions from 6370 to 441.

Running the process in parallel may also improve the performance by an order of magnitude. It would probably require a parallel pipelined function.

But even a 100x performance improvement may not help with an NP-complete problem. You may need to find some additional "short cuts" based on your sample data.

It may help to print out the query that generates the test queries, by un-commenting one of the dbms_output.put_line statements. Add a count(*) to see how many queries will be executed and run with a smaller data set to make an estimate for how long it will take.

If the estimate is a billion years, and you can't think of any tricks to make the brute-force method work faster, it may be time to ask this question on https://cstheory.stackexchange.com/


The idea of the solution is that the columns are independent. So it can be solved column by column.

So you can imagine that you search and build something in multidimensional space. Each column represents a dimension, having values from -inf to +inf. And build the solution dimension by dimension.

  • For the 1st column the solution is: A=1 => false, A=0 => true.
  • Then you add 2nd dimension B. You have 5 values, so dimension on column B be is divided into 6 intervals. Some of the consecutive intervals can be joined. For example <10, 50> and <50,inf> do both imply true.
  • And then you add 3rd dimension.
  • ...

If you want to join dimension intervals on SQL level you can use LAG function. By using partitioning and windowing you order rows by one column. Then you compute a value true/false in a cooked column. And in the next cooked column by using the LAG function you detect whether the true/false flag did change from the previous row.

create table test(  b number,  s number);insert into test values(10, -20);insert into test values(50,  10);insert into test values(15,  20);insert into test values(18,  5);select u.*, case when LAG (b, 1, null) OVER (ORDER BY b) = b then 'Y' else 'N' end same_b_value, LAG (score_flag, 1, null) OVER (ORDER BY b) AS score_flag_prev, case when LAG (score_flag, 1, null) OVER (ORDER BY b) <> score_flag then 'Y' else 'N' end score_flag_changedfrom(  select t.*,  case when t.s >= 0 then 'Y' else 'N' end as score_flag  from test t)  uorder by b asc;

This query will show that value B=15 is significant because it is where the score_flag changes.

I'm not sure about value B=10 in the question. As this one is linked to both positive and negative score values. Should it be included or excluded then?


Very interesting problem. My proposition bases on function check_column, code below. Execution examples:

select CHECK_COLUMN('col01') from dual;    => "COL01 = 0"select CHECK_COLUMN('col03') from dual;    => "COL03 <= 2"select column_name cn, check_column(column_name) crit   from all_tab_columns atc        where atc.table_name='SCORES' and column_name like 'COL%';cn          critCOL01       COL01 = 0COL02       COL02 >= 32  COL03       COL03 <= 2COL04       COL04 = COL04

In your example, row 3, columnB I replaced value 10 with 32, because example was not good, and condition and columnB >10 was not right. Col04 is only for presentation, as it's neutral. You need to stick output strings together in java or sql, but that shouldn't be a problem.

I named base table as scores, then created view positives, you can instead of view put data in some temporary table, execution should be much faster.

create or replace view positives as  select distinct col01, col02, col03, col04     from scores where score>0  minus select COL01,COL02,COL03,COL04    from scores where score<0;

Function is:

create or replace function check_column(i_col in varchar2) return varchar2 as  v_tmp number;  v_cnt number;  v_ret varchar2(4000);begin  -- candidate for neutral column ?  execute immediate 'select count(distinct '||i_col||') from positives' into v_tmp;  execute immediate 'select count(distinct '||i_col||') from scores' into v_cnt;  if v_tmp = v_cnt then    return i_col||' = '||i_col;   -- empty string is better, this is for presentation   end if;  -- candidate for "column = some_value" ?  execute immediate 'select count(distinct '||i_col||') from positives' into v_cnt;  if v_cnt = 1 then    execute immediate 'select distinct '||i_col||' from positives' into v_tmp;    return i_col||' = '||v_tmp;   end if;  -- is this candidate for "column >= some_value" ?  execute immediate 'select min(distinct '||i_col||') from positives' into v_tmp;  execute immediate 'select count(1) from scores where '||i_col||    ' not in (select '||i_col||' from positives) and '||i_col||'>'||v_tmp into v_cnt;  if v_cnt = 0 then    execute immediate 'select min('||i_col||') from scores' into v_cnt;    if v_cnt != v_tmp then       return i_col||' >= '||v_tmp;     end if;  end if;  -- is this candidate for "column <= some_value" ?  execute immediate 'select max(distinct '||i_col||') from positives' into v_tmp;  execute immediate 'select count(1) from scores where '||i_col||    ' not in (select '||i_col||' from positives) and '||i_col||'<'||v_tmp into v_cnt;  if v_cnt = 0 then     execute immediate 'select max('||i_col||') from scores' into v_cnt;    if v_cnt != v_tmp then      return i_col||' <= '||v_tmp;    end if;  end if;  -- none of the above, have to list specific values  execute immediate 'select listagg('||i_col||', '', '') '    ||'within group (order by '||i_col||') '    ||'from (select distinct '||i_col||' from positives)' into v_ret;  return i_col||' in ('||v_ret||')';end check_column;

This solution is nor optimized nor heavily tested, please be careful.If you have Oracle version < 11 replace listagg with wmsys.wm_concat.