import collections
import heapq
from typing import List


class Solution:
    def topKFrequent(self, nums: List[int], k: int) -> List[int]:
        """Return the k most frequent values in nums, most frequent first.

        Args:
            nums: Integers to tally.
            k: Number of distinct values to return.

        Returns:
            Up to k distinct values ordered by descending frequency.
        """
        # Counter is reached through the module because only
        # `import collections` is in scope for this snippet.
        frequency = collections.Counter(nums)
        # most_common(k) yields (number, count) pairs; keep the numbers only.
        return [number for number, _ in frequency.most_common(k)]
class Solution:
    def topKFrequent(self, nums: List[int], k: int) -> List[int]:
        """Tally each number in nums, then pick the k keys with the largest tallies.

        The selection itself is delegated to klargest, using the tally
        lookup as the ranking key.
        """
        tally = {}
        for value in nums:
            if value in tally:
                tally[value] += 1
            else:
                tally[value] = 1
        candidates = tally.keys()
        return klargest(candidates, k=k, key=tally.get)
import heapq
from collections import Counter
from typing import List


class Solution:
    def topKFrequent(self, nums: List[int], k: int) -> List[int]:
        """Return the k most frequent values in nums using a size-k min-heap.

        Args:
            nums: Integers to tally.
            k: Number of distinct values to return.

        Returns:
            The selected values ordered by descending frequency; an empty
            list when nums is empty.

        Raises:
            ValueError: If k is not an int or is not positive.
        """
        if not isinstance(k, int) or k <= 0:
            raise ValueError("k must be a positive integer")
        if not nums:
            return []

        number_to_count = Counter(nums)
        unique_numbers = list(number_to_count)
        get_count = number_to_count.get

        # Shortcuts that make the heap unnecessary.
        if k == 1:
            return [max(number_to_count, key=get_count)]
        if k >= len(unique_numbers):
            return sorted(number_to_count, key=get_count, reverse=True)

        # Heap entries are (count, index, number). The strictly decreasing
        # index makes every entry unique on its first two fields, so ties on
        # count are resolved in favour of numbers seen earlier.
        kept = [
            (get_count(number), index, number)
            for index, number in zip(range(0, -k, -1), unique_numbers[:k])
        ]
        heapq.heapify(kept)
        smallest_kept_count = kept[0][0]
        index = -k
        for number in unique_numbers[k:]:
            count = get_count(number)
            # A candidate must strictly beat the current minimum to enter.
            if count <= smallest_kept_count:
                continue
            heapq.heapreplace(kept, (count, index, number))
            smallest_kept_count = kept[0][0]
            index -= 1
        kept.sort(reverse=True)
        return [number for _, _, number in kept]
from collections import Counter
from typing import List


class Solution:
    def topKFrequent(self, nums: List[int], k: int) -> List[int]:
        """Return the k most frequent values in nums via frequency buckets.

        Raises ValueError when k is not a positive int and returns an empty
        list for empty input. Results are ordered from the highest frequency
        bucket downwards.
        """
        if not (isinstance(k, int) and k > 0):
            raise ValueError("k must be a positive integer")
        if not nums:
            return []

        frequencies = Counter(nums)
        lookup = frequencies.get

        if k == 1:
            return [max(frequencies, key=lookup)]
        if len(frequencies) <= k:
            return sorted(frequencies, key=lookup, reverse=True)

        # buckets[c] collects every value that occurs exactly c times.
        total = len(nums)
        buckets = [[] for _ in range(total + 1)]
        for value, occurrences in frequencies.items():
            buckets[occurrences].append(value)

        # Walk from the highest possible frequency down until k are gathered.
        picked = []
        occurrences = total
        while occurrences > 0 and len(picked) < k:
            picked.extend(buckets[occurrences])
            occurrences -= 1
        return picked[:k]