From b0d8ed436b855f25b37280b84f9d63a7155284b8 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Mon, 19 Apr 2021 12:03:41 +0200 Subject: [PATCH] PARQUET-2027: Fix calculating directory offset for merge --- .../org/apache/parquet/hadoop/Offsets.java | 8 ++- .../hadoop/TestParquetWriterAppendBlocks.java | 63 +++++++++++++++++- .../src/test/resources/test-append_1.parquet | Bin 0 -> 7375 bytes .../src/test/resources/test-append_2.parquet | Bin 0 -> 7374 bytes 4 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 parquet-hadoop/src/test/resources/test-append_1.parquet create mode 100644 parquet-hadoop/src/test/resources/test-append_2.parquet diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Offsets.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Offsets.java index fa25943b82..2edc585c4b 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Offsets.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Offsets.java @@ -55,7 +55,7 @@ public static Offsets getOffsets(SeekableInputStream input, ColumnChunkMetaData * (0 cannot be a valid offset because of the MAGIC bytes) * - The firstDataPageOffset might point to the dictionary page */ - dictionaryPageSize = readDictionaryPageSize(input, newChunkStart); + dictionaryPageSize = readDictionaryPageSize(input, chunk); } else { dictionaryPageSize = chunk.getFirstDataPageOffset() - chunk.getDictionaryPageOffset(); } @@ -68,12 +68,14 @@ public static Offsets getOffsets(SeekableInputStream input, ColumnChunkMetaData return new Offsets(firstDataPageOffset, dictionaryPageOffset); } - private static long readDictionaryPageSize(SeekableInputStream in, long pos) throws IOException { + private static long readDictionaryPageSize(SeekableInputStream in, ColumnChunkMetaData chunk) throws IOException { long origPos = -1; try { origPos = in.getPos(); + in.seek(chunk.getStartingPos()); + long headerStart = in.getPos(); PageHeader header = Util.readPageHeader(in); - long headerSize = in.getPos() - origPos; + long headerSize = in.getPos() - headerStart; return headerSize + header.getCompressed_page_size(); } finally { if (origPos != -1) { diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterAppendBlocks.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterAppendBlocks.java index bda5333523..82d48f411d 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterAppendBlocks.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterAppendBlocks.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -38,6 +38,7 @@ import org.junit.rules.TemporaryFolder; import java.io.File; import java.io.IOException; +import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; @@ -68,6 +69,17 @@ public class TestParquetWriterAppendBlocks { public static final SimpleGroupFactory GROUP_FACTORY = new SimpleGroupFactory(FILE_SCHEMA); + private static final Path STATIC_FILE_1 = createPathFromCP("/test-append_1.parquet"); + private static final Path STATIC_FILE_2 = createPathFromCP("/test-append_2.parquet"); + + private static Path createPathFromCP(String path) { + try { + return new Path(TestParquetWriterAppendBlocks.class.getResource(path).toURI()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } + public Path file1; public List file1content = new ArrayList(); public Path file2; @@ -134,6 +146,51 @@ public void testBasicBehavior() throws IOException { Assert.assertEquals("All records should be present", 0, expected.size()); } + /** + * This test is similar to {@link #testBasicBehavior()} only that it uses static files generated by a previous release + * (1.11.1). This test is to validate the fix of PARQUET-2027. + */ + @Test + public void testBasicBehaviorWithStaticFiles() throws IOException { + List expected = new ArrayList<>(); + readAll(STATIC_FILE_1, expected); + readAll(STATIC_FILE_2, expected); + + Path combinedFile = newTemp(); + ParquetFileWriter writer = new ParquetFileWriter( + CONF, FILE_SCHEMA, combinedFile); + writer.start(); + writer.appendFile(CONF, STATIC_FILE_1); + writer.appendFile(CONF, STATIC_FILE_2); + writer.end(EMPTY_METADATA); + + try (ParquetReader reader = ParquetReader + .builder(new GroupReadSupport(), combinedFile) + .build()) { + + for (Group expectedNext : expected) { + Group next = reader.read(); + // check each value; equals is not supported for simple records + Assert.assertEquals("Each id should match", + expectedNext.getInteger("id", 0), next.getInteger("id", 0)); + Assert.assertEquals("Each string should match", + expectedNext.getString("string", 0), next.getString("string", 0)); + } + Assert.assertNull("No extra records should be present", reader.read()); + } + + } + + private void readAll(Path file, List values) throws IOException { + try (ParquetReader reader = ParquetReader + .builder(new GroupReadSupport(), file) + .build()) { + for (Group g = reader.read(); g != null; g = reader.read()) { + values.add(g); + } + } + } + @Test public void testMergedMetadata() throws IOException { Path combinedFile = newTemp(); diff --git a/parquet-hadoop/src/test/resources/test-append_1.parquet b/parquet-hadoop/src/test/resources/test-append_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a255f86ebcf8147712d9f07ed85631e5a738708a GIT binary patch literal 7375 zcma*scYICv|G@ExAV_Q8NQm00b<(qkqGE4BY}KAwAt5%cwnmI-)e5C*rB2ZnZkT{k_i3iT*3b=U5a+97>=hN+BFl8f8!x33vz(qa4bk0xF^sDkFkKR6!DwQ5DsY zg6en#HBb|^P#bkn7xj>eM^PUQ&=8H#7y{Dp7(_^rp+JQO9R^HTXoAPl6wS~aPoM?T z(GsoD8g0-P?a&?_&=H-`8C}p7-H?Iq=z*TdL@)G4AM`~(^v3`U#FH3=r|>jvIB?;? zM}TKA7+DyCp~yxKhG95zF#;no3ZpRwV=)fnF#*rwIZVX!cmXeB5+-8`rXmm1FdZ{6 z6Zx2hm+&%X;}y)oT+G9KEI+vQw zU?VnRGq&I@Y{fQg$J^L}o!EtUup94U5BB0cypIp?A@<=T?8nDAfP*-M!}tV8a1@{7 z7>?rvKEp|z!fAYtGdPQLIFAdsh)cMPEBFFeaShjT12^#{Zs99@jobJJcknIl;vT-k z_xJ%n;wSu!U+^n_!+rdYKkxv5;xGJ-fABy2iQiw-sltEb};2}JW zawv}qsEA6aj0h4@1xZLoRa8R?s^byVKuy#_ZPYr`7%*X>2_8pNG(&Sdffh(dOSD33v_V_6Lwj^UM|47GbU{~iLk7B|2YMnCz0ezd z&=>vC9|JHDPht?B!qc$fz=a1N0iMBNWMK$~A{#jvhT+J?2#mxijK&y@#W;+|1U!r9 zFcHt=1-yt!n2afyiaboibj-j^2F%R>x00mfxMR*m9u>`MSDPG4i zEXNA0#3~fx4Xnl*ti?L4$D7!Ijo5_E*n+pP72B{KZ(|2`Vi(@QZoG><*o*h@K0d&Q z*oTj>A0OiY4&o3F;}aagQGAMHIF1we3@334r|~(?;4IGJJTBlOF5xn+;0s*EHC)FH z+{Bl-g|F~6ZsQx=!MC`Jd-x9D;|KhRpYSt&!LRrY_whUazyth=zwkHy!T<0t!W(}v z#G*LjPy!`U3h^k7GAN4#JcNf)4&_k+6;TP55kVrVAPLE+ifTwfbv%L^sEJyrjXJ1{ zdPv2isE-C{h(>4(0cm&)A|%LAphANV112mq!Q*I(W@wHl&;sdbiB@QhHfW1>Xpau) zh)(E?F6fGG$Ut}WKu=_%7kZ-)`l28DV*m!?NeseMcp5ewxbWa3z%v+(EDXU=WFrT| zFdVrUfsq)6(HMiV7>DtgfM@X>CgOR#fEO_dlQ9KTk%wuRjv1JVe9Xd2cp0mf$rk#p_sxV2oA?s9@D;wsZG3|}_!f6@58vT?{D2?v6Mn`o_!Yn5 zK7Pj^cz{3g7yiaS_#gg7_@ciUVo@A%D1nkFg?N-k8I(l=9>T*Yhw`X^il~Ijh#(PF zkc4DZMKz?LIvzm{)I=@RMjg~eJ*47M)JFp}L?bkYfHXV?5fWr5P@zGG0TULQ;BhoX zGc?B&Xn}OJL@TsL8?;3`v_}VYL??7c7j#88WS~2GpeHiX3%$_?ebEp7F#rScBnIIr zJPjKTTzK#i;28`?7KUIbvXO&f7>-AoK$(Vwv z$ip;D#|+FwK4#%1yo}j+1#>VL^DrL^P=JM4gjcZ`OYj<&;&m*;a;(5gtU@8)z-p|) zTCBr*yon9ih)vjxEqDuCu?^etHg;eqcHtfD#=F>qy?77r;{$w%efS9b@i7kIAP(U$ zKEV+j#iuxi<2ZrOa1y6*8lU3~&f*--;{q?hDQZ;Z?!O?tM@FdR_UDGlG zLk`D@wr;ZuO|TW6ak}CNb|71VZ98njb_2WUx{~b}uI4d;!o^itHw9A-Od*g}(Y6%N z)lD~?PgN|_uw+;8bVCzV*Jq`^8wgC~o2sLWP7vL%B1^jJD3V~=h9Rg@Uf*htJGnpYOMmSFMO-r+EUvNw=sp_uGdB}?3 znLN?495oQd==uW9k~B#aY}s`gXW4@3da~d-f#PeLCmTjI4-7HTbdd=ZMPhxvYzU4n z>ztdT7*-I-q8S~hE1u3ddV=gYCVAyJj1!qOP&hhS*CfddudCXICX3;t8lh87lEzeQ zTadND<46_Ha-(rkeOEDc)~BmviRwAwjwD5}B~5iTO;O~cedxCA$v#tw9HgMCq9<6c zsS5$eqnnN+8+P%~Q{oNG48Sb-m~(M=*7T zB31)mHylw7v}l~^AGX4xIX8ponilc1T!Ub%oGs1G3!-rmHG{IZ6&4=N15Fi75>j9` zT?}MWp=f=9YDCTm%ijg&qI=A%@8R`UDd+#2+u8ins2Kl zpCXbn(fVLsM^z{=mqK7OnkAT`FAIh)iZzY)k#yB=2>6-0YO0dzv56*A&MlpB;|E_Bc06*OE-r zGTk7W_Y!?wvOJ>e5?xgyscEpbAV~r3!*RpM!t08jruq_vYgBR`w8z< zzO6;`KvWd6)Cq*ZWQeMAOK@D(qFtM&rhAfPsL}lfG#k^Bbb)A-2V|pQ1=I(3Ckn!F z>_Ca;3z@EHh7``{GoNSCH#|!fr0}NVdp7AGjk7Dut{hMwvQAS_Esy($tcLeNF>ome zMb?VqXBwuX)0!31oqF@aJP6ArFhxbzL~gl7<8*qDOjUSpct7%FiTIgekr=KLXrf`* z(Qy`IXi0`Z52TN)gv63u+9CDm7$Ut*mU%P&X4&v#%i+S~MQ_i?6}>%Q^2HW46Aok~ zm6~6_(?0b?@mk&M7OT{&UzJ5!i)Cr)zA^LGOt@MqHKk%!#`=_Q@hu8h&Gzny(C%_HT?#6=1=oXXl%Ft6C+!2?S+Y1T24dF;ont(GqQ{&L~5x&7OIKEF=>rux;! z9}5a=cbrym|NZIy*5YF(UzlI9_vwBc3g%Ah)PLoLEh)WIzCW@{tQ24B+l4dSB_|GM zeBJAXh0m&s7pPmRG~6avZC$zB7H!J3lpn4m&UjFB~s}@|EoRQujwGSYMNPcZ1wNVX0P=~zuPG<=fVDKdDXkznV8og zGx_g?{Lx$T{@B$rH+PP>f5uwtdbI_2cRqZiOpjA{M@;^B;<41+ytesgW@NT)vwBdU zNvZp~E^N?Y<=z9gHs+PNl#`v>cv$g*{8@dUY@0D7rl9<&na?{dCoOwWKCeWF83zuv z`!p>+zW@0hYc4GvwBlxuZ}ks`T|ZDE^+36j)i%b}s#PYYo1x!69ozawx^PKLAGqmM zkIrAYNuSj$93DHhXUTT)#}e`*eKH5sJ?d>s`fJv?4f|8-b)PjOyTgDUx2BdaS8rSP z@4N3@SUyF5IezTuhksgoWM7H-b9Zj|WarRgX{A>#THSq5@60yEDleIyy!zbW{Ck6A zf7)KJ&Xt(L%;jx5&8{&saovO+(my3MQ>$ED@6y}nAHB8h-p1fw!GvY2 zgCA}ztJ-+x!Z!+6joWcGFQ?S?H;=}YOS>C;qV=i;m$Ik-eDGN5HIGj3+C@xQadAR} zzEbL|lWvr1GEnN=xnG6DMq%YW!Ix9j+|o~P*_k%zU}n9u4LfOj`VCBKd*6w#)MY>! zuWI|P-`wrd`RwK|YLBl_eD&h?joVzB(rU-W#Pj}%Rtk{BQPwK*4vNlr;TG$^ttA^e^Po3<~Lo}23r^D_NWqj}98iLJ+bSy^68 zB!O>Ajv75OYuGdOVmhZVJn>|?&|1VZF;q^v|*BkL@Q$42sf7<98)oPD zNu~Vpc1~`#ACuHG*B&`yj6Yh)8Ci3zKXO#o@L@HjG)YMlg`Qp0GkUh~mRYmDGbStB ytI1tR4S2=Gs|Q|vh`LRqrM+?=^jtO2JyYbRCb#DX$&HC{807Gc6amNtH$^Z5Gx-|NL&e0Xl|bKlo}UDr84cAH*OG!i}3I{bL~_MGp_ zr$*0K;VX&J#7N|7*+}FXzQuR=9zWnm{DkZH8Nc8LZsJ$`hTriA{=_ZZ#$Wgw|KMLl z;vxsfDGIV5fWr5P@zGG0TUMPLn~zBeze8|$U+-Dh_+~l_UM3) z=!DMbg0AR>?&yJskd2<`h2H3czUYVk7=VEogu!?ik6;KM#ZcIA;KGBC0K+gGIT(SF z7=_UogR#iPIOJhGCSW2aVKS!RG34WMOvN-jfhRE?GcXggP=MK(gSnW8`B;F3ScJt` zf~T+)%di|PP>83o605KpMOcHiScmm^2G3#xHewStV+)?cR%}BtwqpmL$4VXt9T8s;|;utLpY2hIEqpn!*RTYw{Zd|aSEsL4$k0RoW*-M zhxc(FAK(H$#7FoT7jX%n;8R@2XSjmT@ddubSNIzL!#B8!Yxox5;d}gmAMq2e<7fPW z8@P#I@f&`}ANUiua2tQ&Z~TLQ5lNu_Q3hoZj|7xMc_iX4R6s>kLS@{IDyWKTsE!(_ zi71j#3&}`9ZPY<3>f#>MLmKL%0UDwa8Y3M|&=k$k94*ii0y1zfL`aaKK!pY!225DE z53P`i`_UQ?APa5qAljlG+M@$Hq7yo!3%a5kx}ygkLNW5F%{GB1fIln z%)m^{LIGxD4(4JW=3@aCVi6W&37*1IEW>iFKp~#SO02?a6k!e4Vjb4w89a*(*oaNo zj4gN$Td@tr*p3}|9y{>@c40S4um^kbBKF}W?8nPEfP;7iui`bljyLco4&gA4;3!IQ z49D>n-o^=>#3`J{J2-=PaTf349Nx!ye1Hr15Fg=VT*M`Of=_W7pWzBV#~1h#U*T)~ z58vP_uHjpJhwt$Ne#B3>j-T-hZr~<<#c%i>f8bBt!fpJ8zwrh3H09j~*2hkSo&>kJo5uMN(UCcPS z5VFw|z0ezd&=>vC9|JHDgD@Bm;}HzOqZkSs4qSNf5nvdGBL^ce5~DC0V=xxE7>7KJ z#{^8oBuvH>JcfKcj;WZ2C-5YuV+Lko778#Mb1)b4FdqxB5R0%FOYjtyVi}fW1q$&r zR$>)aqX=uT7VEGc&)`{Xz(#DsW^BQ8*oti^#&+z$^Vo?OunW6Uf<4%a7qJg7VLx8R z0UX3DconbVb-aN$aR`TT1V>SdV>pht@HS51Bu?Qp-oY8Xi?etS=kPwx;{#m4hxiB| z<03BM6MTxx_zYL@IljP`_zGX+fA|JhaSh+%JA98H@FRZ0b^MH9a055-D}KZ8_yd39 z7H;D&{EdI`FCyisf0RL4#3KRaP#%f63l&fil~5UXqYA2`8mglPY9fjx)Iu^+P#blS zin_Q5^^k`8Xn=-jgvLlm6EsCLG)D`xgn$g(3lS1zC{Uq6hXE57?n5hN;(oNo1IR)f zJczbvhxX`zj_8EW=z^~3hVJNrhmeh)=!M?sgTCm8{uqFP7=*!i7>{5G9>q}DaNxp& zj{w6k961<)kr;*17=y9M#W>_)JSJcwCSfwB;4$RmaZJTDJb@=M9WyW!vrvH9n1i{P zhxu55g;<2eSc0dp6w9z2D^Q51u@bAW8bw%xwOEJscm~g612$q4He(B(!&YoVF}7m| zp2tqSfL+*)670cVyoh~x3H$Lf4&WeO!K-);uj388i9~@t|6GdYAJ#3s9s=(<6K#GrmwRbJ<+s$!xhxPk_FY46~XpJUl4uCw#k;|Pw<^k$%Vk|*n$B*o&Qa&EF1zN-ZWi}WSIRBT(2wZOAEJkN4t`_)X{ zw*tiy6y4XkxTEqN93GRoqH9`4V94Qh9YuExlMu_27O+0g6C7e9NS>iZp7lO zSf*jgE(dRLzOK(W-wgyeuzi#B6rCV;UD4B2U($6}Z?O-Zq%lR^6C~C0!^rx!riSy0 znqhjrtq5A6*{n|s&qMZwz}Iy#kZs}{Tc09Jy6PyBK*F+LiA|G@f#A8e$ON9Jh*CJ8 zWIKkdd2CP#JVBLp@^O4lHihhs>}(vo)gYz z+lHzdOy$^Nz8GY-Y3sh=`9?TOW7|PE&UZY8bR>~fhx64fO)xFl5)50^Ws&@50pYlS zLo_W(7bK0Ssexw+Rv`HTt1x}jaO^;d?N{|(#ndHH&{eLcdX6tRrbN<68cC)p3h5Tk zCo_W?aAdx(vp(Gp&qHD7jxO7-Y-o}e+lOaqzQ}h{Z!UGNtD;~VA}?jd@D$6@UCFz% z5AxI?F0O5>)SGG14?LZ6;gm#Qmo@E9o{FXuc;UOg%03j!B`%^Q$f9Rzw&Xaj6RVHF z^4O_w2%2sXXHzl+TUQ9XuIavGIlkn?u4`(BEr+Y}REkq|6;-fRDvwlgWSWK{DzW?v zH>=W#1l{w)e!zZBQlK{4~jHwO@ggZB&x1Bf*p`Aw(ZbdY|80Q{&}{< z;nOWt*Ngo_9q9_mud)%t5viEi`gB{SY^Vy61gAfX^gq|sg@AI=O~;W9JGNiSB(O9~ z5gbP(&W0%qwn)F%0^d{=)vydFmZz3%do&S4AO~5WO6gjXOF9x?#}JuQj^(N5n@nei z*-5Kcbyud2SgvPskesg?+{sfC!u1T(6nxVp4{SnbTik4X)wGpBv{mm;otqkEz_i`-%xaVH?g7dg#E>n zCF;Wri_UPBKobqaj@>^b`h#S7oR-UJsS?RW=>pl}fXdkIsn9uimoTmzh;-+&@F@*{l7)8WV*9{WAM(8J#+A`1-k{KHJu9 zsMMx@!zD!>?peLHa_u>V%bImAoxgJBs`S|ti=zi-rsoYTcVWV)OZgo?n!di(-bL=v zN(Bctj%Z_V+Bw*$u(KfRV=?K_M)Tb7{R-j+)hns6Dt&e2V&mhFZJ&Lk{_&wNlw8Z6 zH@RQ&f+u#)?67;|oNXs(d{j7b@ugS#9BOnVx7)cD`}&u4&)h2P*L=v3Ut2djKcKoi zbVi>psp8I_S;2{(1$jL z`LfH@!5`~KY7MG&DPdT`;)>1UuUu@NaImP#hIJ3lE?E*JZY^m%_~x?d1v%YP+b$f} z^Kc8Z*`$oVM#5!d`t@5Y`%QVKe*Tg3rSwH#N>5e3+v}WGv(;M*N(=96eXQW=0?oYQjkq$rQt`Pi3olGxn_jp3+jE*FEzgcW zJUlKxH-BBWS$d{;(14=+4uj1lCyQ%tol}^yw{T!w-JbqE9TJ9FaV5VU-}e0M_i}LK^_q-(4 zTr@1B&-|qyt|&crF+b=~_49=-2NkLbwQDY(62EfZw5%@ON6ebvZmK!wNM6=N@rqs0 z^RzgB9A%UR1l>K(%D#Rq_4dJ60W?U(~gz-qLot*$wP*=`}VL_lP_2 z>8vhUr)%bvZ~bATr4 z(>Rh`CdZ3JEAgLl<0s_hj2YHA(k+$YNoT91q^7h?PfX<-8z-`I21dg#Co%WU*6E3v zclL5;V@dqYDcsqeS6rCtXMX0{KPsA(ba6y9Ba;4C>(po^CW}w`|BKz3@N$c!%Pn{} zQ;#(LPaFMjQH!Ls4xjPm!e4qoT=4^F~H@#YK|NjbJmZG`ZU3yqpPsUdHIL zoM&tFSa`X literal 0 HcmV?d00001