Skip to content

Commit fe2e4a2

Browse files
authored
Merge pull request #367 from guojidan/fix-sql
feat: enhance metadata extraction with object name for MIME type detection
2 parents c55c7a6 + b391272 commit fe2e4a2

File tree

2 files changed

+120
-3
lines changed

2 files changed

+120
-3
lines changed

rustfs/src/storage/ecfs.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use crate::error::ApiError;
2121
use crate::storage::access::ReqInfo;
2222
use crate::storage::options::copy_dst_opts;
2323
use crate::storage::options::copy_src_opts;
24-
use crate::storage::options::{extract_metadata_from_mime, get_opts};
24+
use crate::storage::options::{extract_metadata_from_mime_with_object_name, get_opts};
2525
use bytes::Bytes;
2626
use chrono::DateTime;
2727
use chrono::Utc;
@@ -1412,7 +1412,7 @@ impl S3 for FS {
14121412

14131413
let mut metadata = metadata.unwrap_or_default();
14141414

1415-
extract_metadata_from_mime(&req.headers, &mut metadata);
1415+
extract_metadata_from_mime_with_object_name(&req.headers, &mut metadata, Some(&key));
14161416

14171417
if let Some(tags) = tagging {
14181418
metadata.insert(AMZ_OBJECT_TAGGING.to_owned(), tags);

rustfs/src/storage/options.rs

Lines changed: 118 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,15 @@ pub fn extract_metadata(headers: &HeaderMap<HeaderValue>) -> HashMap<String, Str
186186

187187
/// Extracts metadata from headers and returns it as a HashMap.
188188
pub fn extract_metadata_from_mime(headers: &HeaderMap<HeaderValue>, metadata: &mut HashMap<String, String>) {
189+
extract_metadata_from_mime_with_object_name(headers, metadata, None);
190+
}
191+
192+
/// Extracts metadata from headers into `metadata`, using the optional object name to detect a default MIME type when no content type is set.
193+
pub fn extract_metadata_from_mime_with_object_name(
194+
headers: &HeaderMap<HeaderValue>,
195+
metadata: &mut HashMap<String, String>,
196+
object_name: Option<&str>,
197+
) {
189198
for (k, v) in headers.iter() {
190199
if let Some(key) = k.as_str().strip_prefix("x-amz-meta-") {
191200
if key.is_empty() {
@@ -210,8 +219,40 @@ pub fn extract_metadata_from_mime(headers: &HeaderMap<HeaderValue>, metadata: &m
210219
}
211220

212221
if !metadata.contains_key("content-type") {
213-
metadata.insert("content-type".to_owned(), "binary/octet-stream".to_owned());
222+
let default_content_type = if let Some(obj_name) = object_name {
223+
detect_content_type_from_object_name(obj_name)
224+
} else {
225+
"binary/octet-stream".to_owned()
226+
};
227+
metadata.insert("content-type".to_owned(), default_content_type);
228+
}
229+
}
230+
231+
/// Detects content type from object name based on file extension.
232+
pub(crate) fn detect_content_type_from_object_name(object_name: &str) -> String {
233+
let lower_name = object_name.to_lowercase();
234+
235+
// Check for Parquet files specifically
236+
if lower_name.ends_with(".parquet") {
237+
return "application/vnd.apache.parquet".to_owned();
238+
}
239+
240+
// Special handling for other data formats that mime_guess doesn't know
241+
if lower_name.ends_with(".avro") {
242+
return "application/avro".to_owned();
214243
}
244+
if lower_name.ends_with(".orc") {
245+
return "application/orc".to_owned();
246+
}
247+
if lower_name.ends_with(".feather") {
248+
return "application/feather".to_owned();
249+
}
250+
if lower_name.ends_with(".arrow") {
251+
return "application/arrow".to_owned();
252+
}
253+
254+
// Use mime_guess for standard file types
255+
mime_guess::from_path(object_name).first_or_octet_stream().to_string()
215256
}
216257

217258
/// List of supported headers.
@@ -646,4 +687,80 @@ mod tests {
646687
assert_eq!(metadata.get("cache-control"), Some(&"public".to_string()));
647688
assert!(!metadata.contains_key("authorization"));
648689
}
690+
691+
#[test]
fn test_extract_metadata_from_mime_with_parquet_object_name() {
    // No headers are supplied, so the content type can only come from the object name.
    let empty_headers = HeaderMap::new();
    let mut collected = HashMap::new();

    extract_metadata_from_mime_with_object_name(&empty_headers, &mut collected, Some("data/test.parquet"));

    let content_type = collected.get("content-type").map(String::as_str);
    assert_eq!(content_type, Some("application/vnd.apache.parquet"));
}
700+
701+
#[test]
fn test_extract_metadata_from_mime_with_various_data_formats() {
    // (object name, expected content type) pairs.
    let expectations = [
        ("data.parquet", "application/vnd.apache.parquet"),
        ("data.PARQUET", "application/vnd.apache.parquet"), // extension matching is case-insensitive
        ("file.avro", "application/avro"),
        ("file.orc", "application/orc"),
        ("file.feather", "application/feather"),
        ("file.arrow", "application/arrow"),
        ("file.json", "application/json"),
        ("file.csv", "text/csv"),
        ("file.txt", "text/plain"),
        ("file.unknownext", "application/octet-stream"), // a genuinely unknown extension
    ];

    for &(filename, expected_content_type) in &expectations {
        // Fresh, empty inputs per case so earlier iterations cannot leak a content type.
        let mut metadata = HashMap::new();
        extract_metadata_from_mime_with_object_name(&HeaderMap::new(), &mut metadata, Some(filename));

        let detected = metadata.get("content-type");
        let wanted = expected_content_type.to_string();
        assert_eq!(detected, Some(&wanted), "Failed for filename: {}", filename);
    }
}
730+
731+
#[test]
fn test_extract_metadata_from_mime_with_existing_content_type() {
    // The request carries an explicit content type...
    let mut request_headers = HeaderMap::new();
    request_headers.insert("content-type", HeaderValue::from_static("custom/type"));

    // ...and the object name would otherwise be detected as Parquet.
    let mut collected = HashMap::new();
    extract_metadata_from_mime_with_object_name(&request_headers, &mut collected, Some("test.parquet"));

    // The existing content-type must be kept, not overwritten by the guess.
    let content_type = collected.get("content-type").map(String::as_str);
    assert_eq!(content_type, Some("custom/type"));
}
742+
743+
#[test]
fn test_detect_content_type_from_object_name() {
    // Small helper so each case reads as "name -> expected type".
    let check = |object_name: &str, expected: &str| {
        assert_eq!(detect_content_type_from_object_name(object_name), expected);
    };

    // Parquet files (custom handling), in both lower and upper case.
    check("test.parquet", "application/vnd.apache.parquet");
    check("TEST.PARQUET", "application/vnd.apache.parquet");

    // Other custom data formats.
    check("data.avro", "application/avro");
    check("data.orc", "application/orc");
    check("data.feather", "application/feather");
    check("data.arrow", "application/arrow");

    // Standard formats (handled by mime_guess).
    check("data.json", "application/json");
    check("data.csv", "text/csv");
    check("data.txt", "text/plain");

    // A genuinely unknown format (an extension mime_guess does not recognise).
    check("unknown.unknownext", "application/octet-stream");

    // A file with no extension at all.
    check("noextension", "application/octet-stream");
}
649766
}

0 commit comments

Comments
 (0)