@@ -186,6 +186,15 @@ pub fn extract_metadata(headers: &HeaderMap<HeaderValue>) -> HashMap<String, Str
186
186
187
187
/// Extracts metadata from headers and returns it as a HashMap.
188
188
pub fn extract_metadata_from_mime ( headers : & HeaderMap < HeaderValue > , metadata : & mut HashMap < String , String > ) {
189
+ extract_metadata_from_mime_with_object_name ( headers, metadata, None ) ;
190
+ }
191
+
192
+ /// Populates `metadata` from `headers`, using the optional object name to choose a default content type when none is already set.
193
+ pub fn extract_metadata_from_mime_with_object_name (
194
+ headers : & HeaderMap < HeaderValue > ,
195
+ metadata : & mut HashMap < String , String > ,
196
+ object_name : Option < & str > ,
197
+ ) {
189
198
for ( k, v) in headers. iter ( ) {
190
199
if let Some ( key) = k. as_str ( ) . strip_prefix ( "x-amz-meta-" ) {
191
200
if key. is_empty ( ) {
@@ -210,8 +219,40 @@ pub fn extract_metadata_from_mime(headers: &HeaderMap<HeaderValue>, metadata: &m
210
219
}
211
220
212
221
if !metadata. contains_key ( "content-type" ) {
213
- metadata. insert ( "content-type" . to_owned ( ) , "binary/octet-stream" . to_owned ( ) ) ;
222
+ let default_content_type = if let Some ( obj_name) = object_name {
223
+ detect_content_type_from_object_name ( obj_name)
224
+ } else {
225
+ "binary/octet-stream" . to_owned ( )
226
+ } ;
227
+ metadata. insert ( "content-type" . to_owned ( ) , default_content_type) ;
228
+ }
229
+ }
230
+
231
+ /// Detects content type from object name based on file extension.
232
+ pub ( crate ) fn detect_content_type_from_object_name ( object_name : & str ) -> String {
233
+ let lower_name = object_name. to_lowercase ( ) ;
234
+
235
+ // Check for Parquet files specifically
236
+ if lower_name. ends_with ( ".parquet" ) {
237
+ return "application/vnd.apache.parquet" . to_owned ( ) ;
238
+ }
239
+
240
+ // Special handling for other data formats that mime_guess doesn't know
241
+ if lower_name. ends_with ( ".avro" ) {
242
+ return "application/avro" . to_owned ( ) ;
214
243
}
244
+ if lower_name. ends_with ( ".orc" ) {
245
+ return "application/orc" . to_owned ( ) ;
246
+ }
247
+ if lower_name. ends_with ( ".feather" ) {
248
+ return "application/feather" . to_owned ( ) ;
249
+ }
250
+ if lower_name. ends_with ( ".arrow" ) {
251
+ return "application/arrow" . to_owned ( ) ;
252
+ }
253
+
254
+ // Use mime_guess for standard file types
255
+ mime_guess:: from_path ( object_name) . first_or_octet_stream ( ) . to_string ( )
215
256
}
216
257
217
258
/// List of supported headers.
@@ -646,4 +687,80 @@ mod tests {
646
687
assert_eq ! ( metadata. get( "cache-control" ) , Some ( & "public" . to_string( ) ) ) ;
647
688
assert ! ( !metadata. contains_key( "authorization" ) ) ;
648
689
}
690
+
691
#[test]
fn test_extract_metadata_from_mime_with_parquet_object_name() {
    // With no headers at all, the content type can only come from the object name.
    let empty_headers = HeaderMap::new();
    let mut collected = HashMap::new();

    extract_metadata_from_mime_with_object_name(&empty_headers, &mut collected, Some("data/test.parquet"));

    let content_type = collected.get("content-type").map(String::as_str);
    assert_eq!(content_type, Some("application/vnd.apache.parquet"));
}
700
+
701
#[test]
fn test_extract_metadata_from_mime_with_various_data_formats() {
    // Each entry is (object name, expected content type).
    let expectations = [
        ("data.parquet", "application/vnd.apache.parquet"),
        ("data.PARQUET", "application/vnd.apache.parquet"), // extension matching is case-insensitive
        ("file.avro", "application/avro"),
        ("file.orc", "application/orc"),
        ("file.feather", "application/feather"),
        ("file.arrow", "application/arrow"),
        ("file.json", "application/json"),
        ("file.csv", "text/csv"),
        ("file.txt", "text/plain"),
        ("file.unknownext", "application/octet-stream"), // a genuinely unknown extension
    ];

    for (object_name, expected) in expectations {
        // Fresh, empty inputs per case so nothing leaks between iterations.
        let headers = HeaderMap::new();
        let mut metadata = HashMap::new();

        extract_metadata_from_mime_with_object_name(&headers, &mut metadata, Some(object_name));

        assert_eq!(
            metadata.get("content-type").map(String::as_str),
            Some(expected),
            "Failed for filename: {}",
            object_name
        );
    }
}
730
+
731
#[test]
fn test_extract_metadata_from_mime_with_existing_content_type() {
    let mut request_headers = HeaderMap::new();
    request_headers.insert("content-type", HeaderValue::from_static("custom/type"));
    let mut collected = HashMap::new();

    extract_metadata_from_mime_with_object_name(&request_headers, &mut collected, Some("test.parquet"));

    // An explicitly supplied content-type must be kept, not replaced by the extension-based guess.
    let content_type = collected.get("content-type").map(String::as_str);
    assert_eq!(content_type, Some("custom/type"));
}
742
+
743
#[test]
fn test_detect_content_type_from_object_name() {
    // Each entry is (object name, expected content type).
    let cases = [
        // Parquet files (custom handling), in both lower and upper case
        ("test.parquet", "application/vnd.apache.parquet"),
        ("TEST.PARQUET", "application/vnd.apache.parquet"),
        // Other custom data formats
        ("data.avro", "application/avro"),
        ("data.orc", "application/orc"),
        ("data.feather", "application/feather"),
        ("data.arrow", "application/arrow"),
        // Standard formats (handled by mime_guess)
        ("data.json", "application/json"),
        ("data.csv", "text/csv"),
        ("data.txt", "text/plain"),
        // A genuinely unknown extension that mime_guess does not recognize
        ("unknown.unknownext", "application/octet-stream"),
        // A name with no extension at all
        ("noextension", "application/octet-stream"),
    ];

    for (object_name, expected) in cases {
        let detected = detect_content_type_from_object_name(object_name);
        // Pair each side with the object name so a failure shows which case broke.
        assert_eq!((object_name, detected.as_str()), (object_name, expected));
    }
}
649
766
}
0 commit comments