How to insert data into hive table from arrays returned by XPath How to insert data into hive table from arrays returned by XPath powershell powershell

How to insert data into hive table from arrays returned by XPath


xpath_... (str,concat('/tag/row[',pe.pos+1,']/@...))

create table hivexml (str string);insert into hivexml values ('<tag><row Id="1" TagName=".net" Count="244006" ExcerptPostId="3624959" WikiPostId="3607476" /><row Id="2" TagName="html" Count="602809" ExcerptPostId="3673183" WikiPostId="3673182" /><row Id="3" TagName="javascript" Count="1274350" ExcerptPostId="3624960" WikiPostId="3607052" /><row Id="4" TagName="css" Count="434937" ExcerptPostId="3644670" WikiPostId="3644669" /><row Id="5" TagName="php" Count="1009113" ExcerptPostId="3624936" WikiPostId="3607050" /><row Id="8" TagName="c" Count="236386" ExcerptPostId="3624961" WikiPostId="3607013" /></tag>');

select  xpath_int    (str,concat('/tag/row[',pe.pos+1,']/@Id'           )) as Id         ,xpath_string (str,concat('/tag/row[',pe.pos+1,']/@TagName'      )) as TagName       ,xpath_int    (str,concat('/tag/row[',pe.pos+1,']/@Count'        )) as Count       ,xpath_int    (str,concat('/tag/row[',pe.pos+1,']/@ExcerptPostId')) as ExcerptPostId       ,xpath_int    (str,concat('/tag/row[',pe.pos+1,']/@WikiPostId'   )) as WikiPostIdfrom    hivexml        lateral view  posexplode (xpath(str,'/tag/row/@Id')) pe;

+----+------------+---------+---------------+------------+| id |  tagname   |  count  | excerptpostid | wikipostid |+----+------------+---------+---------------+------------+|  1 | .net       |  244006 |       3624959 |    3607476 ||  2 | html       |  602809 |       3673183 |    3673182 ||  3 | javascript | 1274350 |       3624960 |    3607052 ||  4 | css        |  434937 |       3644670 |    3644669 ||  5 | php        | 1009113 |       3624936 |    3607050 ||  8 | c          |  236386 |       3624961 |    3607013 |+----+------------+---------+---------------+------------+


xpath (str,concat('/tag/row[',pe.pos+1,']/@*'))

This is a quite clean way to extract all the values of an element together.
What surprised me here it that the order of the attributes does not seem to be according to their order within the XML but in alphabetical order by their names -
@Count,@ExcerptPostId,@Id,@TagName,@WikiPostId

Unfortunately, I can't consider it as a legitimate solution unless I know that the alphabetical attributes order is guaranteed.

select  xpath (str,concat('/tag/row[',pe.pos+1,']/@*')) as row_valuesfrom    hivexml        lateral view  posexplode (xpath(str,'/tag/row/@Id')) pe;

--

["244006","3624959","1",".net","3607476"]["602809","3673183","2","html","3673182"]["1274350","3624960","3","javascript","3607052"]["434937","3644670","4","css","3644669"]["1009113","3624936","5","php","3607050"]["236386","3624961","8","c","3607013"]

select  row_values[2] as Id       ,row_values[3] as TagName       ,row_values[0] as Count           ,row_values[1] as ExcerptPostId       ,row_values[4] as WikiPostIdfrom   (select  xpath (str,concat('/tag/row[',pe.pos+1,']/@*')) as row_values        from    hivexml                lateral view  posexplode (xpath(str,'/tag/row/@Id')) pe        ) x;

+----+------------+---------+---------------+------------+| id |  tagname   |  count  | excerptpostid | wikipostid |+----+------------+---------+---------------+------------+|  1 | .net       |  244006 |       3624959 |    3607476 ||  2 | html       |  602809 |       3673183 |    3673182 ||  3 | javascript | 1274350 |       3624960 |    3607052 ||  4 | css        |  434937 |       3644670 |    3644669 ||  5 | php        | 1009113 |       3624936 |    3607050 ||  8 | c          |  236386 |       3624961 |    3607013 |+----+------------+---------+---------------+------------+


split + str_to_map

select  vals["Id"]              as Id       ,vals["TagName"]         as TagName       ,vals["Count"]           as Count           ,vals["ExcerptPostId"]   as ExcerptPostId       ,vals["WikiPostId"]      as WikiPostIdfrom   (select  str_to_map(e.val,' ','=') as vals        from    hivexml                 lateral view  posexplode(split(translate(str,'"',''),'/?><row')) e        where   e.pos <> 0        ) x;

+----+------------+---------+---------------+------------+| id |  tagname   |  count  | excerptpostid | wikipostid |+----+------------+---------+---------------+------------+|  1 | .net       |  244006 |       3624959 |    3607476 ||  2 | html       |  602809 |       3673183 |    3673182 ||  3 | javascript | 1274350 |       3624960 |    3607052 ||  4 | css        |  434937 |       3644670 |    3644669 ||  5 | php        | 1009113 |       3624936 |    3607050 ||  8 | c          |  236386 |       3624961 |    3607013 |+----+------------+---------+---------------+------------+