Converting a Google search query to a PostgreSQL "tsquery" Converting a Google search query to a PostgreSQL "tsquery" postgresql postgresql

Converting a Google search query to a PostgreSQL "tsquery"


Honestly, I think regular expressions are the way to go with something like this. Just the same, this was a fun exercise. The code below is very much a prototype - in fact, you'll see that I didn't even implement the lexer itself - I just faked the output. I'd like to continue it, but I just don't have more spare time today.

Also, there is definitely a lot more work to be done here in terms of supporting other types of search operators and the like.

Basically, the idea is that a certain type of query is lexed then parsed into a common format (in this case, a QueryExpression instance) which is then rendered back out as another type of query.

<?php

ini_set( "display_errors", "on" );
error_reporting( E_ALL );

/**
 * Turns a raw query string into a flat list of string tokens.
 */
interface ILexer
{
    /**
     * Tokenizes $str; the result is retrieved with getTokens().
     *
     * @param string $str Raw query text.
     * @return void
     */
    public function execute( $str );

    /**
     * @return string[] Tokens produced by the most recent execute() call.
     */
    public function getTokens();
}

/**
 * Turns a query string into a QueryExpression tree with the help of a lexer.
 */
interface IParser
{
    /**
     * @param ILexer $lexer Lexer used to tokenize the input of parse().
     */
    public function __construct( ILexer $lexer );

    /**
     * @param string $input Raw query text.
     * @return QueryExpression Root of the parsed expression tree.
     */
    public function parse( $input );

    /**
     * @param string $token Token to append to the parser's own token list.
     * @return void
     */
    public function addToken( $token );
}

/**
 * Lexer for a small subset of the Google search syntax.
 *
 * Emitted token format (the format GoogleQueryParser consumes):
 *   - '(' and ')'            grouping
 *   - '-text' / '+text'      excluded / explicitly included word or phrase
 *   - 'or text' / 'and text' word or phrase preceded by an upper-case OR / AND
 *   - 'text'                 plain word or quoted phrase
 *
 * Known limits of this string-only token format: an operator or modifier
 * placed directly before '(' cannot be represented and is dropped, and when a
 * term has both an operator and a modifier ("a OR -b") the modifier wins.
 */
class GoogleQueryLexer implements ILexer
{
    private $tokenStack = array();

    /** @var string Characters of the word/phrase currently being read. */
    private $buffer = '';

    /** @var string Pending '-' or '+' for the next term, or ''. */
    private $modifier = '';

    /** @var string Pending 'or' / 'and' for the next term, or ''. */
    private $operator = '';

    public function execute( $str )
    {
        $str = (string) $str;

        // Start from a clean slate so the lexer instance can be reused.
        $this->tokenStack = array();
        $this->buffer     = '';
        $this->modifier   = '';
        $this->operator   = '';

        $inQuotes = false;
        $length   = strlen( $str );

        for ( $i = 0; $i < $length; $i++ )
        {
            $char = $str[$i];

            if ( $inQuotes )
            {
                // Inside quotes everything up to the closing quote is literal.
                if ( $char === '"' )
                {
                    $inQuotes = false;
                    $this->emitBuffer( true );
                }
                else
                {
                    $this->buffer .= $char;
                }
                continue;
            }

            if ( $char === '"' )
            {
                // A quote directly after a word ends that word first.
                $this->emitBuffer( false );
                $inQuotes = true;
            }
            elseif ( $char === '(' || $char === ')' )
            {
                $this->emitBuffer( false );
                // Operators/modifiers cannot be attached to a parenthesis.
                $this->modifier     = '';
                $this->operator     = '';
                $this->tokenStack[] = $char;
            }
            elseif ( strpos( " \t\r\n", $char ) !== false )
            {
                $this->emitBuffer( false );
                // A dangling '-' or '+' followed by whitespace modifies nothing.
                $this->modifier = '';
            }
            elseif ( ( $char === '-' || $char === '+' ) && $this->buffer === '' && $this->modifier === '' )
            {
                // Only a leading sign is a modifier; "well-known" keeps its hyphen.
                $this->modifier = $char;
            }
            else
            {
                $this->buffer .= $char;
            }
        }

        // Flush the last word, or the text of an unterminated quoted phrase.
        $this->emitBuffer( $inQuotes );
    }

    public function getTokens()
    {
        return $this->tokenStack;
    }

    /**
     * Converts the buffered text into a token (or a pending operator) and
     * clears the buffer. Empty text never produces a token.
     *
     * @param bool $isQuoted Whether the buffered text came from a quoted phrase.
     * @return void
     */
    private function emitBuffer( $isQuoted )
    {
        $text         = $isQuoted ? trim( $this->buffer ) : $this->buffer;
        $this->buffer = '';

        if ( $text === '' )
        {
            return;
        }

        // A bare, unquoted, upper-case OR / AND is an operator for the next term.
        if ( !$isQuoted && $this->modifier === '' && ( $text === 'OR' || $text === 'AND' ) )
        {
            $this->operator = strtolower( $text );
            return;
        }

        if ( $this->modifier !== '' )
        {
            $text = $this->modifier . $text;
        }
        elseif ( $this->operator !== '' )
        {
            $text = $this->operator . ' ' . $text;
        }

        $this->modifier     = '';
        $this->operator     = '';
        $this->tokenStack[] = $text;
    }
}

/**
 * Builds a QueryExpression tree from the tokens of a Google-style query.
 */
class GoogleQueryParser implements IParser
{
    protected $lexer;

    // Declared explicitly: addToken() used to create this as a dynamic property.
    protected $tokenStack = array();

    public function __construct( ILexer $lexer )
    {
        $this->lexer = $lexer;
    }

    public function addToken( $token )
    {
        $this->tokenStack[] = $token;
    }

    /**
     * Lexes $input and folds the resulting tokens into an expression tree.
     *
     * @param string $input Raw query text.
     * @return QueryExpression Always the root expression, even when the input
     *                         leaves a parenthesis unclosed.
     */
    public function parse( $input )
    {
        $this->lexer->execute( $input );

        $root    = new QueryExpression();
        $current = $root;

        foreach ( $this->lexer->getTokens() as $token )
        {
            $current = $this->processToken( $token, $current );
        }

        return $root;
    }

    /**
     * Applies one token to $expression.
     *
     * @param string          $token      Token in the format described on GoogleQueryLexer.
     * @param QueryExpression $expression Expression currently being filled.
     * @return QueryExpression Expression that the next token should be applied to.
     */
    protected function processToken( $token, QueryExpression $expression )
    {
        $token = (string) $token;

        if ( $token === '' )
        {
            return $expression;
        }

        if ( $token === '(' )
        {
            return $expression->initiateSubExpression();
        }

        if ( $token === ')' )
        {
            // A stray ')' at the root has no parent; ignore it instead of failing.
            $parent = $expression->getParentExpression();
            return $parent === null ? $expression : $parent;
        }

        $modifier = $token[0];

        if ( $modifier === '-' || $modifier === '+' )
        {
            $phrase = (string) substr( $token, 1 );

            // A bare '-' or '+' carries no phrase and is skipped.
            if ( $phrase !== '' )
            {
                if ( $modifier === '-' )
                {
                    $expression->addExclusionPhrase( $phrase );
                }
                else
                {
                    $expression->addPhrase( $phrase );
                }
            }

            return $expression;
        }

        $spaceAt = strpos( $token, ' ' );

        if ( $spaceAt !== false )
        {
            $operator = strtolower( trim( substr( $token, 0, $spaceAt ) ) );
            $phrase   = trim( substr( $token, $spaceAt ) );

            if ( $phrase !== '' )
            {
                if ( $operator === 'and' )
                {
                    $expression->addAndPhrase( $phrase );
                    return $expression;
                }

                if ( $operator === 'or' )
                {
                    $expression->addOrPhrase( $phrase );
                    return $expression;
                }
            }
        }

        // No recognised operator prefix: the whole token is the phrase.
        $expression->addPhrase( $token );

        return $expression;
    }
}

/**
 * Query-language-neutral expression: a list of phrases plus nested
 * sub-expressions (one per parenthesised group).
 *
 * Phrases and sub-expressions are stored in separate lists, so their relative
 * order in the original query is not retained.
 */
class QueryExpression
{
    protected $phrases = array();
    protected $subExpressions = array();
    protected $parent;

    /**
     * @param QueryExpression|null $parent Enclosing expression, or null for the root.
     */
    public function __construct( $parent=null )
    {
        $this->parent = $parent;
    }

    /**
     * Creates a child expression, registers it and returns it.
     *
     * @return QueryExpression
     */
    public function initiateSubExpression()
    {
        $expression = new self( $this );
        $this->subExpressions[] = $expression;
        return $expression;
    }

    /**
     * @return QueryPhrase[]
     */
    public function getPhrases()
    {
        return $this->phrases;
    }

    /**
     * @return QueryExpression[]
     */
    public function getSubExpressions()
    {
        return $this->subExpressions;
    }

    /**
     * @return QueryExpression|null Null when this is the root expression.
     */
    public function getParentExpression()
    {
        return $this->parent;
    }

    protected function addQueryPhrase( QueryPhrase $phrase )
    {
        $this->phrases[] = $phrase;
    }

    public function addPhrase( $input )
    {
        $this->addQueryPhrase( new QueryPhrase( $input ) );
    }

    public function addOrPhrase( $input )
    {
        $this->addQueryPhrase( new QueryPhrase( $input, QueryPhrase::MODE_OR ) );
    }

    public function addAndPhrase( $input )
    {
        $this->addQueryPhrase( new QueryPhrase( $input, QueryPhrase::MODE_AND ) );
    }

    public function addExclusionPhrase( $input )
    {
        $this->addQueryPhrase( new QueryPhrase( $input, QueryPhrase::MODE_EXCLUDE ) );
    }
}

/**
 * A single word or phrase together with the way it joins the query.
 */
class QueryPhrase
{
    const MODE_DEFAULT = 1;
    const MODE_OR = 2;
    const MODE_AND = 3;
    const MODE_EXCLUDE = 4;

    protected $phrase;
    protected $mode;

    /**
     * @param string $input Phrase text.
     * @param int    $mode  One of the MODE_* constants.
     */
    public function __construct( $input, $mode=self::MODE_DEFAULT )
    {
        $this->phrase = $input;
        $this->mode = $mode;
    }

    public function getMode()
    {
        return $this->mode;
    }

    public function __toString()
    {
        return (string) $this->phrase;
    }
}

/**
 * Renders a QueryExpression as PostgreSQL tsquery text.
 *
 * The result is tsquery input text only; bind it as a query parameter rather
 * than concatenating it into an SQL statement.
 */
class TsqueryBuilder
{
    protected $expression;
    protected $query;

    public function __construct( QueryExpression $expression )
    {
        $this->expression = $expression;
        $this->query = trim( $this->processExpression( $expression ), ' &|' );
    }

    /**
     * @return string The rendered tsquery text ('' for an empty expression).
     */
    public function getResult()
    {
        return $this->query;
    }

    /**
     * Renders one expression: its phrases first, then its sub-expressions,
     * each AND-ed on as a parenthesised group.
     *
     * @param QueryExpression $expression
     * @return string Rendered text without leading/trailing operators.
     */
    protected function processExpression( QueryExpression $expression )
    {
        $query = '';

        foreach ( $expression->getPhrases() as $phrase )
        {
            switch ( $phrase->getMode() )
            {
                case QueryPhrase::MODE_OR :
                    $format = "| '%s' ";
                    break;
                case QueryPhrase::MODE_EXCLUDE :
                    $format = "& !'%s' ";
                    break;
                default :
                    // Plain terms are implicitly AND-ed; tsquery requires an
                    // explicit operator between every pair of operands.
                    $format = "& '%s' ";
            }

            $query .= sprintf( $format, $this->escapeLexeme( (string) $phrase ) );
        }

        foreach ( $expression->getSubExpressions() as $subExpression )
        {
            $inner = $this->processExpression( $subExpression );

            // "()" is not valid tsquery, so empty groups are left out.
            if ( $inner !== '' )
            {
                $query .= "& (" . $inner . ") ";
            }
        }

        // Strip the operator that was prepended to the first operand.
        return trim( $query, ' &|' );
    }

    /**
     * Escapes text for use inside a single-quoted tsquery lexeme by doubling
     * backslashes and apostrophes.
     *
     * @param string $lexeme
     * @return string
     */
    private function escapeLexeme( $lexeme )
    {
        return str_replace( array( '\\', "'" ), array( '\\\\', "''" ), $lexeme );
    }
}

$parser = new GoogleQueryParser( new GoogleQueryLexer() );
$queryBuilder = new TsqueryBuilder( $parser->parse( '("used cars" OR "new cars") -ford -mistubishi' ) );
echo $queryBuilder->getResult();