Find the maximum occurring word in a given set of strings
Given a huge set of words with duplicates present, find the maximum occurring word in it. If two words have the same count, return any one of them.
For example,
keys = [code, coder, coding, codable, codec, codecs, coded, codeless, codec, codecs, codependence, codex, codify, codependents, codes, code, coder, codesign, codec, codeveloper, codrive, codec, codecs, codiscovered]
Output:
The maximum occurring word is codec.
Its count is 4
The idea is to use Trie (Prefix Tree) to solve this problem. We start by inserting each key into the Trie and store its count so far (along with the key itself) in the leaf nodes. After all nodes are inserted into the Trie, perform its preorder traversal (DFS), and find the maximum frequency word by comparing the count present at leaf nodes. Note that we can also use a map to solve this problem.
Following is the C++, Java, and Python implementation of the idea:
C++
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
#include <iostream> #include <unordered_map> #include <string> using namespace std; // Data structure to store a Trie node struct Trie { // `count` and `key` is only set for leaf nodes // `key` stores the string, and `count` stores its frequency so far string key; int count = 0; // each node stores a map to its child nodes unordered_map<char, Trie*> character; }; // Iterative function to insert a string into a Trie void insert(Trie* const &head, string const &str) { // start from the root node Trie* curr = head; for (char ch: str) { // create a new node if the path doesn't exist if (curr->character.find(ch) == curr->character.end()) { curr->character[ch] = new Trie(); } // go to the next node curr = curr->character[ch]; } // store key and its count in leaf nodes curr->key = str; curr->count += 1; } // Function to perform preorder traversal on given Trie // and find a word with the maximum frequency void preorder(Trie* const curr, int &max_count, string &key) { // base condition if (curr == nullptr) { return; } for (auto pair: curr->character) { // leaf nodes have a non-zero count if (max_count < pair.second->count ) { key = pair.second->key; max_count = pair.second->count; } // recur for current node's children preorder(pair.second, max_count, key); } } int main() { // given set of keys string words[] = { "code", "coder", "coding", "codable", "codec", "codecs", "coded", "codeless", "codec", "codecs", "codependence", "codex", "codify", "codependents", "codes", "code", "coder", "codesign", "codec", "codeveloper", "codrive", "codec", "codecs", "codiscovered" }; // insert all keys into a Trie Trie* head = new Trie(); for (string word: words) { insert(head, word); } int count = 0; string key; // perform preorder traversal on a Trie and find the key // with maximum frequency preorder(head, count, key); cout << "Word : " << key << endl; cout << "Count: " << count << endl; return 0; } |
Output:
Word : codec
Count: 4
Java
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; // A class to store a Trie node class TrieNode { // `count` and `key` is only set for leaf nodes // `key` stores the string, and `count` stores its frequency so far String key; int count; // each node stores a map to its child nodes Map<Character, TrieNode> character = null; // Constructor TrieNode() { character = new HashMap<>(); } } class Main { // Iterative function to insert a string into a Trie public static void insert(TrieNode head, String str) { // start from the root node TrieNode curr = head; for (char c: str.toCharArray()) { // create a new node if the path doesn't exist curr.character.putIfAbsent(c, new TrieNode()); // go to the next node curr = curr.character.get(c); } // store key and its count in leaf nodes curr.key = str; curr.count += 1; } // Function to perform preorder traversal on a Trie and // find a word with the maximum frequency public static int preorder(TrieNode curr, int maxCount, StringBuilder key) { // return false if Trie is empty if (curr == null) { return maxCount; } for (var entry: curr.character.entrySet()) { // leaf nodes have a non-zero count if (maxCount < entry.getValue().count) { key.replace(0, key.length(), entry.getValue().key); maxCount = entry.getValue().count; } // recur for current node's children maxCount = preorder(entry.getValue(), maxCount, key); } return maxCount; } public static void main(String[] args) { // given set of keys List<String> words = Arrays.asList( "code", "coder", "coding", "codable", "codec", "codecs", "coded", "codeless", "codec", "codecs", "codependence", "codex", "codify", "codependents", "codes", "code", "coder", "codesign", "codec", "codeveloper", "codrive", "codec", "codecs", "codiscovered" ); // Insert all keys into a Trie TrieNode head = new TrieNode(); for (String word: words) { insert(head, word); } int count = 0; StringBuilder key = new StringBuilder(); // perform preorder traversal on a Trie and find the key // with maximum frequency count = preorder(head, count, key); System.out.println("Word : " + key); System.out.println("Count: " + count); } } |
Output:
Word : codec
Count: 4
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# A class to store a Trie node class TrieNode: def __init__(self): # `count` and `key` is only set for leaf nodes # `key` stores the string, and `count` stores its frequency so far self.key = None self.count = 0 # each node stores a dictionary to its child nodes self.character = {} # Iterative function to insert a string into a Trie def insert(head, s): # start from the root node curr = head for c in s: # go to the next node and create a new node if the path doesn't exist curr = curr.character.setdefault(c, TrieNode()) # store key and its count in leaf nodes curr.key = s curr.count += 1 # Function to perform preorder traversal on a Trie and # find a word with the maximum frequency def preorder(curr, key='', max_count=0): # return false if Trie is empty if curr is None: return key, max_count for (k, v) in curr.character.items(): # leaf node has a non-zero count if max_count < v.count: key = v.key max_count = v.count # recur for current node's children key, max_count = preorder(v, key, max_count) return key, max_count if __name__ == '__main__': # given set of keys words = [ 'code', 'coder', 'coding', 'codable', 'codec', 'codecs', 'coded', 'codeless', 'codec', 'codecs', 'codependence', 'codex', 'codify', 'codependents', 'codes', 'code', 'coder', 'codesign', 'codec', 'codeveloper', 'codrive', 'codec', 'codecs', 'codiscovered' ] # Insert all keys into a Trie head = TrieNode() for word in words: insert(head, word) # perform preorder traversal on a Trie and find the key # with a maximum frequency key, count = preorder(head) print('Word :', key) print('Count:', count) |
Output:
Word : codec
Count: 4
The time complexity of the above solution is O(N.M), where N
is the total number of given words and M
is the maximum word length. The auxiliary space required by the program is O(N × M).
Exercise:
1. Extend this solution to print all maximum occurring words (having the same count).
2. Extend this solution to print the first k
maximum occurring word.
Find first `k` maximum occurring words in a given set of strings
Longest Common Prefix in a given set of strings (Using Trie)
Thanks for reading.
To share your code in the comments, please use our online compiler that supports C, C++, Java, Python, JavaScript, C#, PHP, and many more popular programming languages.
Like us? Refer us to your friends and support our growth. Happy coding :)