How to query struct array with Hive (get_json_object)? How to query struct array with Hive (get_json_object)? json json

How to query struct array with Hive (get_json_object)?


I found a solution for this:

Use the Hive explode UDTF to explode the struct array, i.e., create a second (temporary) table with one record for each struct in the array "features".

CREATE TABLE tbl_exploded asselect main_id, f.name as f_name,f.value as f_valuefrom tblLATERAL VIEW explode(features) exploded_table as f-- optionally filter here instead of in 2nd query:-- where f.name = 'baz'; 

The result of this is:

qwert, foo, ab12345qwert, bar, cd67890qwert, baz, ["A","B","C"]

Now you can select the main_id and value like this:

select main_id, f_value from tbl_exploded where f_name = 'baz';


This one could be ok.

ParseJsonWithPath

ADD JAR your-path/ParseJsonWithPath.jar;CREATE TEMPORARY FUNCTION parseJsonWithPath AS 'com.ntc.hive.udf.ParseJsonWithPath';SELECT parseJsonWithPath(jsonStr, xpath) FROM ....

The field to be parsed can be a json string(jsonStr), given the xpath, you can get what you want.

For example

jsonStr{ "book": [    {        "category": "reference",        "author": "Nigel Rees",        "title": "Sayings of the Century",        "price": 8.95    },    {        "category": "fiction",        "author": "Evelyn Waugh",        "title": "Sword of Honour",        "price": 12.99   }}xpath"$.book"         return the insider json string [....]"$.book[?(@.price < 10)]"         return the [8.95]

more detail


The UDF pasted below I think is close to your needs. It takes array<struct>, a string, and an integer. String is the field name, in your case "name", and the third argument is the value to match on. Currently it expects an integer but it should be relatively easy to change this to string / text for your purpose.

import org.apache.hadoop.hive.ql.exec.Description;import org.apache.hadoop.hive.ql.exec.UDFArgumentException;import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;import org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import org.apache.hadoop.hive.serde2.lazy.LazyString;import org.apache.hadoop.hive.serde2.lazy.LazyLong;import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.StructField;import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantIntObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector;import org.apache.hadoop.io.Text;import org.apache.hadoop.io.LongWritable;import java.util.ArrayList;import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector;@Description(name = "extract_value",    value = "_FUNC_( array< struct<value:string> > ) - Collect all \"value\" field values inside an array of struct(s), and return the results in an array<string>",    extended = "Example:\n SELECT _FUNC_(array_of_structs_with_value_field)")public class StructFromArrayStructDynamicInt        extends GenericUDF{    private ArrayList ret;    private ListObjectInspector listOI;    private StructObjectInspector structOI;    private ObjectInspector indOI;    private ObjectInspector valOI;    private ObjectInspector arg1OI;    private ObjectInspector arg2OI;    private String indexName;    WritableConstantStringObjectInspector element1OI;    WritableConstantIntObjectInspector element2OI;    @Override    public ObjectInspector initialize(ObjectInspector[] args)            throws UDFArgumentException    {        if (args.length != 3) {            throw new UDFArgumentLengthException("The function extract_value() requires exactly three arguments.");        }        if (args[0].getCategory() != Category.LIST) {            throw new UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument for extract_value but " + args[0].getTypeName() + " is found instead");        }        if (args[1].getCategory() != Category.PRIMITIVE) {            throw new UDFArgumentTypeException(0, "Second argument is expected to be primitive but " + args[1].getTypeName() + " is found instead");        }        if (args[2].getCategory() != Category.PRIMITIVE) {            throw new UDFArgumentTypeException(0, "Second argument is expected to be primitive but " + args[2].getTypeName() + " is found instead");        }        listOI = ((ListObjectInspector) args[0]);        structOI = ((StructObjectInspector) listOI.getListElementObjectInspector());        arg1OI = (StringObjectInspector) args[1];        arg2OI = args[2];        this.element1OI = (WritableConstantStringObjectInspector) arg1OI;        this.element2OI = (WritableConstantIntObjectInspector) arg2OI;        indexName = element1OI.getWritableConstantValue().toString();//        if (structOI.getAllStructFieldRefs().size() != 2) {//            throw new UDFArgumentTypeException(0, "Incorrect number of fields in the struct, should be one");//        }//        StructField valueField = structOI.getStructFieldRef("value");        StructField indexField = structOI.getStructFieldRef(indexName);        //If not, throw exception//        if (valueField == null) {//            throw new UDFArgumentTypeException(0, "NO \"value\" field in input structure");//        }        if (indexField == null) {            throw new UDFArgumentTypeException(0, "Index field not in input structure");        }        //Are they of the correct types?        //We store these object inspectors for use in the evaluate() method//        valOI = valueField.getFieldObjectInspector();        indOI = indexField.getFieldObjectInspector();        //First are they primitives//        if (valOI.getCategory() != Category.PRIMITIVE) {//            throw new UDFArgumentTypeException(0, "value field must be of primitive type");//        }        if (indOI.getCategory() != Category.PRIMITIVE) {            throw new UDFArgumentTypeException(0, "index field must be of primitive type");        }        if (arg1OI.getCategory() != Category.PRIMITIVE) {            throw new UDFArgumentTypeException(0, "second argument must be primitive type");        }        if (arg2OI.getCategory() != Category.PRIMITIVE) {            throw new UDFArgumentTypeException(0, "third argument must be primitive type");        }        //Are they of the correct primitives?//        if (((PrimitiveObjectInspector)valOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {//            throw new UDFArgumentTypeException(0, "value field must be of string type");//        }        if (((PrimitiveObjectInspector)indOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.LONG) {            throw new UDFArgumentTypeException(0, "index field must be of long type");        }        if (((PrimitiveObjectInspector)arg1OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {            throw new UDFArgumentTypeException(0, "second arg must be of string type");        }        if (((PrimitiveObjectInspector)arg2OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.INT) {            throw new UDFArgumentTypeException(0, "third arg must be of int type");        }//        ret = new ArrayList();        return listOI.getListElementObjectInspector();//        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;//        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);    }    @Override    public Object evaluate(DeferredObject[] arguments)            throws HiveException    {//        ret.clear();        if (arguments.length != 3) {            return null;        }        if (arguments[0].get() == null) {        return null;        }        int numElements = listOI.getListLength(arguments[0].get());//        long xl = argOI.getPrimitiveJavaObject(arguments[1].get());//        long xl = arguments[1].get(); //9;        long xl2 = element2OI.get(arguments[2].get());//        String xl1 = element1OI.getPrimitiveJavaObject(arguments[2].get());//        long xl = 9;        for (int i = 0; i < numElements; i++) {//            LazyString valDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("value")));            long indValue = (Long) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef(indexName)));//            throw new HiveException("second arg must be of string type");//            LazyString indDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("index")));//            Text valueValue = ((StringObjectInspector) valOI).getPrimitiveWritableObject(valDataObject);//            LongWritable indValue = ((LazyLongObjectInspector) indOI).getPrimitiveWritableObject(indDataObject);            if(indValue == xl2) {                return listOI.getListElement(arguments[0].get(), i);            }//            ret.add(valueValue);       }        return null;    }    @Override    public String getDisplayString(String[] strings) {        assert (strings.length > 0);        StringBuilder sb = new StringBuilder();        sb.append("extract_value(");        sb.append(strings[0]);        sb.append(")");        return sb.toString();    }}

Here is the code for this and a couple other working udfs that do things with array<struct>.