How to insert data into hive table from arrays returned by XPath
xpath_... (str,concat('/tag/row[',pe.pos+1,']/@...))
create table hivexml (str string);insert into hivexml values ('<tag><row Id="1" TagName=".net" Count="244006" ExcerptPostId="3624959" WikiPostId="3607476" /><row Id="2" TagName="html" Count="602809" ExcerptPostId="3673183" WikiPostId="3673182" /><row Id="3" TagName="javascript" Count="1274350" ExcerptPostId="3624960" WikiPostId="3607052" /><row Id="4" TagName="css" Count="434937" ExcerptPostId="3644670" WikiPostId="3644669" /><row Id="5" TagName="php" Count="1009113" ExcerptPostId="3624936" WikiPostId="3607050" /><row Id="8" TagName="c" Count="236386" ExcerptPostId="3624961" WikiPostId="3607013" /></tag>');
select xpath_int (str,concat('/tag/row[',pe.pos+1,']/@Id' )) as Id ,xpath_string (str,concat('/tag/row[',pe.pos+1,']/@TagName' )) as TagName ,xpath_int (str,concat('/tag/row[',pe.pos+1,']/@Count' )) as Count ,xpath_int (str,concat('/tag/row[',pe.pos+1,']/@ExcerptPostId')) as ExcerptPostId ,xpath_int (str,concat('/tag/row[',pe.pos+1,']/@WikiPostId' )) as WikiPostIdfrom hivexml lateral view posexplode (xpath(str,'/tag/row/@Id')) pe;
+----+------------+---------+---------------+------------+| id | tagname | count | excerptpostid | wikipostid |+----+------------+---------+---------------+------------+| 1 | .net | 244006 | 3624959 | 3607476 || 2 | html | 602809 | 3673183 | 3673182 || 3 | javascript | 1274350 | 3624960 | 3607052 || 4 | css | 434937 | 3644670 | 3644669 || 5 | php | 1009113 | 3624936 | 3607050 || 8 | c | 236386 | 3624961 | 3607013 |+----+------------+---------+---------------+------------+
xpath (str,concat('/tag/row[',pe.pos+1,']/@*'))
This is a quite clean way to extract all the values of an element together.
What surprised me here it that the order of the attributes does not seem to be according to their order within the XML but in alphabetical order by their names -
@Count,@ExcerptPostId,@Id,@TagName,@WikiPostId
Unfortunately, I can't consider it as a legitimate solution unless I know that the alphabetical attributes order is guaranteed.
select xpath (str,concat('/tag/row[',pe.pos+1,']/@*')) as row_valuesfrom hivexml lateral view posexplode (xpath(str,'/tag/row/@Id')) pe;
--
["244006","3624959","1",".net","3607476"]["602809","3673183","2","html","3673182"]["1274350","3624960","3","javascript","3607052"]["434937","3644670","4","css","3644669"]["1009113","3624936","5","php","3607050"]["236386","3624961","8","c","3607013"]
select row_values[2] as Id ,row_values[3] as TagName ,row_values[0] as Count ,row_values[1] as ExcerptPostId ,row_values[4] as WikiPostIdfrom (select xpath (str,concat('/tag/row[',pe.pos+1,']/@*')) as row_values from hivexml lateral view posexplode (xpath(str,'/tag/row/@Id')) pe ) x;
+----+------------+---------+---------------+------------+| id | tagname | count | excerptpostid | wikipostid |+----+------------+---------+---------------+------------+| 1 | .net | 244006 | 3624959 | 3607476 || 2 | html | 602809 | 3673183 | 3673182 || 3 | javascript | 1274350 | 3624960 | 3607052 || 4 | css | 434937 | 3644670 | 3644669 || 5 | php | 1009113 | 3624936 | 3607050 || 8 | c | 236386 | 3624961 | 3607013 |+----+------------+---------+---------------+------------+
split + str_to_map
select vals["Id"] as Id ,vals["TagName"] as TagName ,vals["Count"] as Count ,vals["ExcerptPostId"] as ExcerptPostId ,vals["WikiPostId"] as WikiPostIdfrom (select str_to_map(e.val,' ','=') as vals from hivexml lateral view posexplode(split(translate(str,'"',''),'/?><row')) e where e.pos <> 0 ) x;
+----+------------+---------+---------------+------------+| id | tagname | count | excerptpostid | wikipostid |+----+------------+---------+---------------+------------+| 1 | .net | 244006 | 3624959 | 3607476 || 2 | html | 602809 | 3673183 | 3673182 || 3 | javascript | 1274350 | 3624960 | 3607052 || 4 | css | 434937 | 3644670 | 3644669 || 5 | php | 1009113 | 3624936 | 3607050 || 8 | c | 236386 | 3624961 | 3607013 |+----+------------+---------+---------------+------------+