Skip to content

Commit b316448

Browse files
author
Michel Davit
committed
Update parquet example
1 parent 433f709 commit b316448

File tree

2 files changed

+24
-13
lines changed

2 files changed

+24
-13
lines changed

scio-examples/src/main/scala/com/spotify/scio/examples/extra/ParquetExample.scala

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,13 @@ object ParquetExample {
4141
* These case classes represent both full and projected field mappings from the [[Account]] Avro
4242
* record.
4343
*/
44-
case class AccountFull(id: Int, `type`: String, name: Option[String], amount: Double)
44+
case class AccountFull(
45+
id: Int,
46+
`type`: String,
47+
name: Option[String],
48+
amount: Double,
49+
accountStatus: Option[AccountStatus]
50+
)
4551
case class AccountProjection(id: Int, name: Option[String])
4652

4753
/**
@@ -108,21 +114,19 @@ object ParquetExample {
108114

109115
private def avroSpecificIn(sc: ScioContext, args: Args): ClosedTap[String] = {
110116
// Macros for generating column projections and row predicates
111-
val projection = Projection[Account](_.getId, _.getName, _.getAmount)
117+
// account_status is the only field with a default value that can be left out of the projection
118+
val projection = Projection[Account](_.getId, _.getType, _.getName, _.getAmount)
112119
val predicate = Predicate[Account](x => x.getAmount > 0)
113120

114121
sc.parquetAvroFile[Account](args("input"), projection, predicate)
115-
// The result Account records are not complete Avro objects. Only the projected columns are present while the rest are null.
116-
// These objects may fail serialization and it’s recommended that you map them out to tuples or case classes right after reading.
117-
.map(x => AccountProjection(x.getId, Some(x.getName.toString)))
118122
.saveAsTextFile(args("output"))
119123
}
120124

121125
private def avroGenericIn(sc: ScioContext, args: Args): ClosedTap[String] = {
122126
val schema = Account.getClassSchema
123127
implicit val genericRecordCoder: Coder[GenericRecord] = avroGenericRecordCoder(schema)
124128

125-
val parquetIn = sc.parquetAvroFile[GenericRecord](args("input"), schema)
129+
val parquetIn = sc.parquetAvroGenericRecordFile(args("input"), schema)
126130

127131
// Catches a specific bug with encoding GenericRecords read by parquet-avro
128132
parquetIn
@@ -146,12 +150,19 @@ object ParquetExample {
146150
// but close to `parquet.block.size`, i.e. 1 GiB. This guarantees that each file contains 1 row group only and reduces seeks.
147151
.saveAsParquetAvroFile(args("output"), numShards = 1, conf = fineTunedParquetWriterConfig)
148152

153+
private[extra] def toScalaFull(account: Account): AccountFull =
154+
AccountFull(
155+
account.getId,
156+
account.getType.toString,
157+
Some(account.getName.toString),
158+
account.getAmount,
159+
Some(account.getAccountStatus)
160+
)
161+
149162
private def typedOut(sc: ScioContext, args: Args): ClosedTap[AccountFull] =
150163
sc.parallelize(fakeData)
151-
.map(x => AccountFull(x.getId, x.getType.toString, Some(x.getName.toString), x.getAmount))
152-
.saveAsTypedParquetFile(
153-
args("output")
154-
)
164+
.map(toScalaFull)
165+
.saveAsTypedParquetFile(args("output"))
155166

156167
private[extra] def toExample(account: Account): Example = {
157168
val amount = Feature

scio-examples/src/test/scala/com/spotify/scio/examples/extra/ParquetExampleTest.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ class ParquetExampleTest extends PipelineSpec {
4646

4747
"ParquetExample" should "work for SpecificRecord input" in {
4848
val expected = ParquetExample.fakeData
49-
.map(x => AccountProjection(x.getId, Some(x.getName.toString)))
49+
// set the default value on the field outside the projection
50+
.map(x => Account.newBuilder(x).setAccountStatus(null).build())
5051
.map(_.toString)
5152

5253
JobTest[com.spotify.scio.examples.extra.ParquetExample.type]
@@ -79,8 +80,7 @@ class ParquetExampleTest extends PipelineSpec {
7980
}
8081

8182
it should "work for typed output" in {
82-
val expected = ParquetExample.fakeData
83-
.map(a => AccountFull(a.getId, a.getType.toString, Some(a.getName.toString), a.getAmount))
83+
val expected = ParquetExample.fakeData.map(ParquetExample.toScalaFull)
8484

8585
JobTest[com.spotify.scio.examples.extra.ParquetExample.type]
8686
.args("--output=out.parquet", "--method=typedOut")

0 commit comments

Comments
 (0)