|
| 1 | +using System.Collections.Generic; |
| 2 | +using System.Linq; |
| 3 | + |
| 4 | +namespace DataStructures |
| 5 | +{ |
| 6 | + /// <summary> |
| 7 | + /// Inverted index is the simplest form of document indexing, |
| 8 | + /// allowing performing boolean queries on text data. |
| 9 | + /// |
| 10 | + /// This realization is just simplified for better understanding the process of indexing |
| 11 | + /// and working on straightforward string inputs. |
| 12 | + /// </summary> |
| 13 | + public class InvertedIndex |
| 14 | + { |
| 15 | + private readonly Dictionary<string, List<string>> invertedIndex = new(); |
| 16 | + |
| 17 | + /// <summary> |
| 18 | + /// Build inverted index with source name and source content. |
| 19 | + /// </summary> |
| 20 | + /// <param name="sourceName">Name of the source.</param> |
| 21 | + /// <param name="sourceContent">Content of the source.</param> |
| 22 | + public void AddToIndex(string sourceName, string sourceContent) |
| 23 | + { |
| 24 | + var context = sourceContent.Split(' ').Distinct(); |
| 25 | + foreach (var word in context) |
| 26 | + { |
| 27 | + if (!invertedIndex.ContainsKey(word)) |
| 28 | + { |
| 29 | + invertedIndex.Add(word, new List<string> { sourceName }); |
| 30 | + } |
| 31 | + else |
| 32 | + { |
| 33 | + invertedIndex[word].Add(sourceName); |
| 34 | + } |
| 35 | + } |
| 36 | + } |
| 37 | + |
| 38 | + /// <summary> |
| 39 | + /// Returns the source names contains ALL terms inside at same time. |
| 40 | + /// </summary> |
| 41 | + /// <param name="terms">List of terms.</param> |
| 42 | + /// <returns>Source names.</returns> |
| 43 | + public IEnumerable<string> And(IEnumerable<string> terms) |
| 44 | + { |
| 45 | + var entries = terms |
| 46 | + .Select(term => invertedIndex |
| 47 | + .Where(x => x.Key.Equals(term)) |
| 48 | + .SelectMany(x => x.Value)) |
| 49 | + .ToList(); |
| 50 | + |
| 51 | + var intersection = entries |
| 52 | + .Skip(1) |
| 53 | + .Aggregate(new HashSet<string>(entries.First()), (hashSet, enumerable) => |
| 54 | + { |
| 55 | + hashSet.IntersectWith(enumerable); |
| 56 | + return hashSet; |
| 57 | + }); |
| 58 | + |
| 59 | + return intersection; |
| 60 | + } |
| 61 | + |
| 62 | + /// <summary> |
| 63 | + /// Returns the source names contains AT LEAST ONE from terms inside. |
| 64 | + /// </summary> |
| 65 | + /// <param name="terms">List of terms.</param> |
| 66 | + /// <returns>Source names.</returns> |
| 67 | + public IEnumerable<string> Or(IEnumerable<string> terms) |
| 68 | + { |
| 69 | + var sources = new List<string>(); |
| 70 | + foreach (var term in terms) |
| 71 | + { |
| 72 | + var source = invertedIndex |
| 73 | + .Where(x => x.Key.Equals(term)) |
| 74 | + .SelectMany(x => x.Value); |
| 75 | + |
| 76 | + sources.AddRange(source); |
| 77 | + } |
| 78 | + |
| 79 | + return sources.Distinct(); |
| 80 | + } |
| 81 | + } |
| 82 | +} |
0 commit comments