@@ -9,52 +9,27 @@ public class _609 {
 
     public static class Solution1 {
         public List<List<String>> findDuplicate(String[] paths) {
-            Map<String, List<String>> map = new HashMap<>(); // key is the file content, value is the list of directories that contain this file
+            Map<String, List<String>> contentMap = new HashMap<>();
             for (String path : paths) {
-                String[] dirAndFiles = path.split(" ");
-                for (int i = 1; i < dirAndFiles.length; i++) {
-                    String content = dirAndFiles[i].substring(dirAndFiles[i].indexOf("(") + 1, dirAndFiles[i].indexOf(")"));
-                    if (!map.containsKey(content)) {
-                        map.put(content, new ArrayList<>());
+                String[] contents = path.split(" ");
+                for (int i = 1; i < contents.length; i++) {
+                    int start = contents[i].indexOf('(');
+                    int end = contents[i].indexOf(')');
+                    String content = contents[i].substring(start + 1, end);
+                    if (!contentMap.containsKey(content)) {
+                        contentMap.put(content, new ArrayList<>());
                     }
-                    List<String> dirs = map.get(content);
-                    dirs.add(dirAndFiles[0] + "/" + dirAndFiles[i].substring(0, dirAndFiles[i].indexOf("(")));
-                    map.put(content, dirs);
+                    List<String> dupFiles = contentMap.get(content);
+                    dupFiles.add(contents[0] + "/" + contents[i].substring(0, start));
                 }
             }
-
             List<List<String>> result = new ArrayList<>();
-            for (String content : map.keySet()) {
-                if (map.get(content).size() > 1) {
-                    List<String> dupFile = new ArrayList<>();
-                    for (String dir : map.get(content)) {
-                        dupFile.add(dir);
-                    }
-                    result.add(dupFile);
+            for (String key : contentMap.keySet()) {
+                if (contentMap.get(key).size() > 1) {
+                    result.add(contentMap.get(key));
                 }
             }
             return result;
         }
     }
-
-    /** Answers to follow-up questions:
-     * 1. Imagine you are given a real file system; how would you search for files, DFS or BFS?
-     * A: Both BFS and DFS can do the job, but BFS uses extra memory; on the other hand, BFS takes advantage of memory locality, so it can be faster.
-     *
-     * 2. If the file content is very large (GB level), how would you modify your solution?
-     * A: We would first group all files by size, since files of different sizes are guaranteed to differ; then
-     * we hash a small part of each file using MD5, SHA-256, etc. Only when the hashes match do we compare the contents byte by byte.
-     *
-     * 3. If you can only read the file 1 KB at a time, how would you modify your solution?
-     * A: This does not change the solution: we hash each 1 KB chunk as we read it, and again compare byte by byte only when necessary.
-     *
-     * 4. What is the time complexity of your modified solution? What are its most time- and memory-consuming parts, and how could they be optimized?
-     * A: O(n^2 * k) in the worst case, since we may have to compare each file against every other file; k is the length of a file.
-     * Comparing files (by size, by hash, and eventually byte by byte) is the most time-consuming part.
-     * Generating a hash for every file is the most memory-consuming part.
-     *
-     * 5. How do you make sure the duplicate files you find are not false positives?
-     * A: Size comparison, hash checks, and byte-by-byte verification rule out false positives.
-     */
-
 }
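
For follow-up 1 in the removed notes: a minimal, self-contained sketch of searching a real file system with BFS, assuming plain java.io.File access. The class and method names here (BfsFileSearch, listFilesBfs) are illustrative, not part of the repository.

import java.io.File;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;

public class BfsFileSearch {

    // Breadth-first walk: visit directories level by level, collecting regular files.
    public static List<File> listFilesBfs(File rootDir) {
        List<File> files = new ArrayList<>();
        Deque<File> queue = new ArrayDeque<>();
        queue.offer(rootDir);
        while (!queue.isEmpty()) {
            File current = queue.poll();
            File[] children = current.listFiles();
            if (children == null) {
                continue; // unreadable directory (or a plain file); skip it
            }
            for (File child : children) {
                if (child.isDirectory()) {
                    queue.offer(child); // enqueue to visit later, keeping level order
                } else {
                    files.add(child);
                }
            }
        }
        return files;
    }
}

A DFS variant needs only to use the same deque as a stack (push/pop instead of offer/poll), which trades the wide frontier's memory for depth-first traversal.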
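For follow-ups 2 and 3: a sketch of the size-then-hash pipeline those answers describe, reading files in 1 KB chunks. DuplicateFinder and its helper names are assumptions for illustration, and the final byte-by-byte verification is only indicated in a comment.

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class DuplicateFinder {

    // Step 1: bucket candidate files by size; files of different sizes can never match.
    public static Map<Long, List<Path>> groupBySize(List<Path> paths) throws IOException {
        Map<Long, List<Path>> bySize = new HashMap<>();
        for (Path path : paths) {
            bySize.computeIfAbsent(Files.size(path), k -> new ArrayList<>()).add(path);
        }
        return bySize;
    }

    // Step 2: hash a file incrementally, 1 KB at a time, so GB-sized files are never
    // held in memory and the 1-KB-per-read restriction is respected.
    public static String sha256(Path path) throws IOException, NoSuchAlgorithmException {
        MessageDigest digest = MessageDigest.getInstance("SHA-256");
        byte[] buffer = new byte[1024];
        try (InputStream in = Files.newInputStream(path)) {
            int read;
            while ((read = in.read(buffer)) != -1) {
                digest.update(buffer, 0, read);
            }
        }
        StringBuilder hex = new StringBuilder();
        for (byte b : digest.digest()) {
            hex.append(String.format("%02x", b));
        }
        return hex.toString();
    }

    // Step 3 (not shown): files that agree on both size and hash are confirmed
    // byte by byte, e.g. Files.mismatch(p1, p2) == -1 on Java 12+, which rules
    // out hash-collision false positives.
}

Grouping by size first means hashes are computed only for files that could possibly be duplicates, which also addresses the memory concern raised in answer 4.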