DataStructure

Tao Zou

2024-08-19

Basic Data Structures

Stack

class StackError(Exception):
    def __init__(self, message):
        super().__init__(message)

class Stack:
  def __init__(self):
    self.items = []
  
  def isEmpty(self):
    return self.items == []
  
  def push(self, item):
    self.items.append(item)
    
  def pop(self):
    return self.items.pop()
  
  def peek(self):
    if self.isEmpty():
      raise StackError(f"peek cannot deal with empty stack.")
    else:
      return self.items[-1]
  
  def size(self):
    return len(self.items)

Queue

class Queue:
    def __init__(self):
        self.items = []

    def isEmpty(self):
        return self.items == []
    
    def enqueue(self, item):
        self.items.append(item)
    
    def dequeue(self):
        return self.items.pop(0)
    
    def size(self):
        return len(self.items)

Notice: When I use enqueue(), the time complexity is \(O(1)\); When I use dequeue(), the time complexity is \(O(n)\).

If I want both of them to be \(O(1)\), then use collections.deque

from collections import deque

queue = deque()

queue.append('item1')
queue.append('item2')

first_item = queue.popleft()

print(first_item) # item1
print(queue) # deque(['item2'])

Deque

class Deque:
    def __init__(self):
        self.items = []

    def isEmpty(self):
        return self.items == []

    def addFront(self, item):
        self.items.insert(0, item)
        
    def addRear(self, item):
        self.items.append(item)

    def removeFront(self):
        return self.items.pop(0)
    
    def removeRear(self):
        return self.items.pop()

    def size(self):
        return len(self.items)

Notice: When I use addFront() or removeFront(), the time complexity is \(O(1)\); When I use addRear() or removeRear(), the time complexity is \(O(n)\).

Binary tree

Construct a binary tree.

class BinaryTree:
    def __init__(self, rootObj):
        self.key = rootObj  # String
        self.left = None  # BinaryTree
        self.right = None  # BinaryTree
        self.parent = None  # BinaryTree

    def insert_left(self, newNode):
        if self.left is None:
            node = BinaryTree(newNode)
            self.left = node
            node.parent = self
        else:
            node = BinaryTree(newNode)
            node.left = self.left
            self.left.parent = node
            self.left = node
            node.parent = self

    def insert_right(self, newKey):
        if self.right is None:
            node = BinaryTree(newKey)
            self.right = node
            node.parent = self
        else:
            node = BinaryTree(newKey)
            node.left = self.right
            self.right.parent = node
            self.right = node
            node.parent = self

Pre-order traversal, In-order traversal, Post-order traversal.

def traverse_tree(node, traverse_type, key_list=None):
    if key_list is None:
        key_list = []
    if node is None:
        return
    if traverse_type == 'pre':  # pre-order traversal
        key_list.append(node.key)
        traverse_tree(node.left, traverse_type, key_list)
        traverse_tree(node.right, traverse_type, key_list)
    elif traverse_type == 'in':  # in-order traversal
        traverse_tree(node.left, traverse_type, key_list)
        key_list.append(node.key)
        traverse_tree(node.right, traverse_type, key_list)
    elif traverse_type == 'post':  # post-order traversal
        traverse_tree(node.left, traverse_type, key_list)
        traverse_tree(node.right, traverse_type, key_list)
        key_list.append(node.key)

    return key_list

Binary heap

Values increase from root to leaves.

class BinaryHeap:
    def __init__(self):
        self.heapList = [0]
        self.size = 0

    def perc_up(self, i):
        while i // 2 > 0:
            if self.heapList[i] < self.heapList[i//2]:
                self.heapList[i], self.heapList[i//2] = self.heapList[i//2], self.heapList[i]
                i = i // 2
            else:
                break

    def perc_down(self, i):  # i is the index of heapList
        while i * 2 <= self.size:
            if i * 2 + 1 > self.size:
                target_index = i * 2
            else:
                if self.heapList[i * 2] < self.heapList[i * 2 + 1]:
                    target_index = i * 2
                else:
                    target_index = i * 2 + 1
            if self.heapList[i] > self.heapList[target_index]:
                self.heapList[i], self.heapList[target_index] = self.heapList[target_index], self.heapList[i]
                i = target_index
            else:
                break

    def insert(self, item):
        self.heapList.append(item)
        self.size += 1
        self.perc_up(self.size)

    def build_heap(self, alist):
        self.heapList = [0] + alist
        self.size = len(alist)
        i = self.size // 2
        while i > 0:  # heapList[1] = alist[0]
            self.perc_down(i)
            i -= 1

    def pop_min(self):
        min_value = self.heapList[1]
        self.heapList[1] = self.heapList.pop()
        self.size -= 1
        self.perc_down(1)

        return min_value

create a binary heap(build_heap())

\(n\) is the array’s length including the initial \(0\).

Time complexity for building a binary heap(worst case): \(O(n)\) \[ \begin{aligned} &2^0\cdot(\log_2^n-1)+2^1\cdot(\log_2^n-2)+\cdots+2^{\log_2^n-2}\cdot\big{(}\log_2^n-(\log_2^n-1)\big{)}\\ &=log_2^n\cdot(2^0+2^1+\cdots+2^{\log_2^n-2})-\big{(}1\cdot2^0+2\cdot2^1+3\cdot2^2+\cdots+(\log_2^n-1)\cdot2^{\log_2^n-2}\big{)}\\ &=n-\log_2^n-1 \end{aligned} \]

myHeap = BinaryHeap()
myHeap.build_heap([10, 9, 7, 8, 3, 4, 1, 7, 1, 1, 2])

retrieve the min value(pop_min())

Time complexity for popping all values: \(O(n\log_2^n)\)

print(myHeap.pop_min(), myHeap.pop_min(), myHeap.pop_min(), myHeap.pop_min())
## 1 1 1 2

Graph-Prim

Prim算法用于找到一个有权无向图的最小生成树。

输入:起始节点、邻接矩阵

输出:依次加入生成树的的节点

# Prim Algorithm
M = float('inf')
nodeSet = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
distanceMatrix = [[0, 28, M, M, M, 10, M],
                  [28, 0, 16, M, M, M, 14],
                  [M, 16, 0, 12, M, M, M],
                  [M, M, 12, 0, 22, M, 18],
                  [M, M, M, 22, 0, 25, 24],
                  [10, M, M, M, 25, 0, M],
                  [M, 14, M, 18, 24, M, 0]]

def prim(nodeSet: list, distanceMatrix: list, initNode: str) -> list:
    # 真实值转化为索引
    nodeSetIndex = list(range(len(nodeSet)))
    initNodeIndex = nodeSet.index(initNode)

    setU = [initNodeIndex]
    setV = {}
    for nodeIndex in nodeSetIndex:
        if nodeIndex != initNodeIndex:
            setV[nodeIndex] = distanceMatrix[initNodeIndex][nodeIndex]

    while len(setU) < len(nodeSet):
        closestNodeIndex = min(setV, key=setV.get)
        setU.append(closestNodeIndex)

        setV.pop(closestNodeIndex)
        for nodeIndex in setV:
            if distanceMatrix[closestNodeIndex][nodeIndex] < setV[nodeIndex]:
                setV[nodeIndex] = distanceMatrix[closestNodeIndex][nodeIndex]

    return [nodeSet[nodeIndex] for nodeIndex in setU]  # 索引转化为真实值

print(prim(nodeSet, distanceMatrix, 'A'))
## ['A', 'F', 'E', 'D', 'C', 'B', 'G']

Graph-Dijkstra

Dijkstra算法用于找到一个图中起始节点到其他所有节点的最短路径。

输入:起始节点、邻接矩阵

输出:记录与其他所有节点最短路径长度的数组、各个节点途径点数组

# Dijkstra Algorithm
M = float('inf')
nodeSet = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
distanceMatrix = [[0, 28, M, M, M, 10, M],
                  [28, 0, 16, M, M, M, 14],
                  [M, 12, 0, 12, M, M, M],
                  [M, M, 16, 0, 22, M, 18],
                  [M, M, M, 22, 0, 25, 24],
                  [10, M, M, M, 25, 0, M],
                  [M, 14, M, 18, 24, M, 0]]

def dijskstra(nodeSet: list, distanceMatrix: list, initNode: str) -> tuple:
    # 真实值转化为索引
    nodeSetIndex = list(range(len(nodeSet)))
    initNodeIndex = nodeSet.index(initNode)

    # 初始化必要数据结构
    setU = [initNodeIndex]
    setV = {}  # nodeIndex: (distance from initNodeIndex to nodeIndex)
    for nodeIndex in nodeSetIndex:
        if nodeIndex != initNodeIndex:
            setV[nodeIndex] = distanceMatrix[initNodeIndex][nodeIndex]
    distance, path = [0] * len(nodeSetIndex), [None] * len(nodeSetIndex)
    for index, dist in enumerate(distanceMatrix[initNodeIndex]):
        if dist < M and dist != 0:
            distance[index] = dist
            path[index] = initNodeIndex

    while len(setU) < len(nodeSetIndex):
        closestNodeIndex = min(setV, key=setV.get)  # 最近节点索引
        closestDistance = setV[closestNodeIndex]  # 最近节点对应的距离

        if closestDistance >= M:
            break
        distance[closestNodeIndex] = closestDistance

        setU.append(closestNodeIndex)
        setV.pop(closestNodeIndex)

        # 更新setV、记录途径点
        for nodeIndex in setV:
            distViaCloestNode = closestDistance + distanceMatrix[closestNodeIndex][nodeIndex]
            if (distViaCloestNode < setV[nodeIndex]):
                # 如果,以上述最近节点为中间节点,比到待选节点的已记录的最短距离更近
                setV[nodeIndex] = distViaCloestNode
                path[nodeIndex] = closestNodeIndex

    return distance, [nodeSet[nodeIndex] if nodeIndex is not None else None for nodeIndex in path]  # 索引转化为真实值

distance, path = dijskstra(nodeSet, distanceMatrix, 'A')
print(distance, path)
## [0, 28, 44, 56, 35, 10, 42] [None, 'A', 'B', 'C', 'F', 'A', 'B']

Search And Sort

Bubble sort

\(n\): length of array

Time complexity: \(O(n^2)\)

One exchange needs three assignment operations.

def bubble_sort(nums):
    sortcompleted = False
    i = 0
    while i < len(nums) - 1 and not sortcompleted:
        sortcompleted = True
        for j in range(0, len(nums)-1-i):
            if nums[j] > nums[j+1]:
                nums[j], nums[j+1] = nums[j+1], nums[j]
                sortcompleted = False
        i += 1

    return nums

Insertion sort

\(n\): length of array

Time complexity: \(O(n^2)\)

Without exchange operation and less assignment operations, it is faster than Bubble Sort.

def intertion_sort(nums):
    for i in range(len(nums)-1):
        j = i
        temp = nums[j+1]
        while temp < nums[j] and j >= 0:
            nums[j+1] = nums[j]
            j -= 1
        nums[j+1] = temp

    return nums

Merge sort

pass sliced array into function

There will be \(\log_2^n\) binary split. In each split, there will be \(n\) times for array slice and \(n\) times for sort.

Time complexity: \(O\big{(}(n+n)log_2^n\big{)}=O(n\log_2^n)\)

Space complexity: \(O(n+\frac{n}{2^1}+\frac{n}{2^2}+\cdots+1)=O(2n-1)\)

def merge(left_array, right_array):
    merged_array = []
    i, j = 0, 0
    while i < len(left_array) and j < len(right_array):
        if left_array[i] < right_array[j]:
            merged_array.append(left_array[i])
            i += 1
        else:
            merged_array.append(right_array[j])
            j += 1
    if i < len(left_array):
        merged_array.extend(left_array[i:])
    elif j < len(right_array):
        merged_array.extend(right_array[j:])
    return merged_array

def merge_sort(nums):
    if len(nums) == 1:
        return nums

    left_array = merge_sort(nums[:len(nums) // 2])
    right_array = merge_sort(nums[len(nums) // 2:])
    sorted_merged_array = merge(left_array, right_array)

    return sorted_merged_array

pass array’s index into function

Time complexity: \(O\big{(}nlog_2^n\big{)}=O(n\log_2^n)\) reduce \(n\) times for slicing array.

Space complexity: \(O(n+2\log_2^n)=O(n)\) only pass \(2\) indices into recursion function.

def merge(left_array, right_array):
    merged_array = []
    i, j = 0, 0
    while i < len(left_array) and j < len(right_array):
        if left_array[i] < right_array[j]:
            merged_array.append(left_array[i])
            i += 1
        else:
            merged_array.append(right_array[j])
            j += 1
    if i < len(left_array):
        merged_array.extend(left_array[i:])
    elif j < len(right_array):
        merged_array.extend(right_array[j:])
    return merged_array

def merge_sort(nums, begin, end):
    if end - begin == 0:
        return [nums[begin]]
    left_array = merge_sort(nums, begin, (begin+end) // 2)
    right_array = merge_sort(nums, (begin+end) // 2 + 1, end)
    sorted_merged_array = merge(left_array, right_array)

    return sorted_merged_array

nums = [4, 1, 6, 0, 9, 1, 1, 1, -1]
sorted_result = merge_sort(nums, 0, len(nums)-1)

Basic Problems

Binary search for ordered sequence

no duplication

Binary search for ordered sequence without duplicate elements.

def binary_search(nums, target):
    """
    :nums: ordered number sequence without duplicate elements
    :target: target number
    :return: target number index or -1
    """
    length_nums = len(nums)
    begin, end = 0, length_nums-1
    while begin <= end:
        mid = (end + begin) // 2
        if target == nums[mid]:
            return mid  # find it
        elif target == nums[begin]:
            return begin  # find it
        elif target == nums[end]:
            return end  # find it
        elif nums[begin] < target < nums[mid]:
            end = mid - 1
            continue
        elif nums[mid] < target < nums[end]:
            begin = mid + 1
            continue
        else:  # target > num[-1] or target < num[0]
            return -1
    return -1

nums = [1, 2, 4, 5, 6, 7]
nums = [3]
target_index = binary_search(nums, 2)
print(target_index)

\(n\): length of array

Time complexity: \(O(\log_2^n)\)

Space complexity: \(O(1)\)

no duplication (recursion)

Binary search for ordered sequence without duplicate elements.

def binary_search(nums, target):
    if len(nums) == 0:
        return False
    else:
        mid = len(nums) // 2
        if nums[mid] == target:
            return True
        elif nums[mid] > target:
            return binary_search(nums[:mid], target)
        else:
            return binary_search(nums[mid+1:], target)

\(n\): length of array

Time complexity: \(O(\log_2^n)\)

Worst-case space complexity: \(O(n+\frac{n}{2}+\frac{n}{4}+\cdots+1)=O(n)\)

duplication

Binary search for ordered sequence with duplicate elements.

\(n\): length of array

\(k\): the max length of cluster

  • If \(n\) is enormous and \(k\) is a constant: Use binary search to find target number index and then traverse around the cluster.

\[O(\log_2^n+k)=O(\log_2^n)\]

def binary_search(nums, target):
  """
  :nums: ordered number sequence without duplicate elements
  :target: target number
  :return: target number index or -1
  """
  length_nums = len(nums)
  begin, end = 0, length_nums-1

  while begin <= end:
    mid = (end + begin) // 2

    if target == nums[mid]:
      left, right = mid, mid
      while nums[left] == target:
        left -= 1
      while nums[right] == target:
        right += 1
      return left+1, right-1
    elif target == nums[begin]:
      left, right = begin, begin
      while nums[right] == target:
        right += 1
      return left, right-1
    elif target == nums[end]:
      left, right = end, end
      while nums[left] == target:
        left -= 1
      return left+1, right
    elif nums[begin] < target < nums[mid]:
      end = mid - 1
      continue
    elif nums[mid] < target < nums[end]:
      begin = mid + 1
      continue
    else:  # target > num[-1] or target < num[0]
      return -1
  return -1
  • If \(n\) and \(k\) are both enormous: Use binary search to find target number index and then use binary search once more (or twice more) to find the two ends of a cluster.

\[O(\log_2^n+\log_2^k)\]

def find_left_right_index(begin, mid, end, target):
  """
  Property1: All target numbers are between num[begin] and num[end]
  """
  left_index, right_index = None, None
  if nums[mid] == target:
    left, right = begin, mid
    while left <= right:
      m = (left + right) // 2
      if nums[m] < target:
        left = m + 1
      else:
        if m-1 == -1 or nums[m-1] != target:
          left_index = m
          break
        else:
          right = m - 1
    left, right = mid, end
    while left <= right:
      m = (left + right) // 2
      if nums[m] > target:
        right = m - 1
      else:
        if m+1 == len(nums) or nums[m+1] != target:
          right_index = m
          break
        else:
          left = m + 1
    return left_index, right_index
  elif nums[begin] == target:
    left, right = begin, mid
    while left <= right:
      m = (left + right) // 2
      if nums[m] > target:
        right = m - 1
      else:
        if m+1 == len(nums) or nums[m+1] != target:
          right_index = m
          break
        else:
          left = m + 1
    return begin, right_index
  elif nums[end] == target:
    left, right = mid, end
    while left <= right:
      m = (left + right) // 2
      if nums[m] < target:
        left = m + 1
      else:
        if m-1 == -1 or nums[m-1] != target:
          left_index = m
          break
        else:
          right = m - 1
    return left_index, end
def binary_search(nums, target):
  """
  :nums: ordered number sequence without duplicate elements
  :target: target number
  :return: target number index or -1
  """
  length_nums = len(nums)
  begin, end = 0, length_nums-1

  while begin <= end:
    mid = (end + begin) // 2

    if target == nums[mid]:
      return find_left_right_index(begin, mid, end, target)
    elif target == nums[begin]:
      return find_left_right_index(begin, mid, end, target)
    elif target == nums[end]:
      return find_left_right_index(begin, mid, end, target)
    elif nums[begin] < target < nums[mid]:
      end = mid - 1
      continue
    elif nums[mid] < target < nums[end]:
      begin = mid + 1
      continue
    else:  # target > nums[-1] or target < nums[0]
      return -1
  return -1

Find the only duplicate or the only unique number

Find the only duplicate number

slow pointer moves by one point every time. Set \(s_1\) be the distance traveled by slow pointer.

fast pointer moves by two points every time. Set \(s_2\) be the distance traveled by fast pointer.

Set \(s\) be the distance from the beginning point to the loop beginning point.

Two pointers move from the begin point at the same time, and they will finally encounter at the point in the loop. Set \(l\) be the distance from the loop beginning point to the meeting point. And set \(K\in N\).

Therefore,

\[s_1=s+K\cdot L+l\]

\[\frac{s_2-s_1}{L}=\frac{s_1}{L}=K+\frac{s+l}{L}\in N^*\]

Let one of the pointers go back to the beginning point and the other stay still. Then both of the pointers move by one point every time, then are bound to meet at loop beginning point.

任务

设有\(n+1\)个元素,每个元素的值是位于\(1\)\(n\)之间的连续的整数。这些元素中,仅有一个值会出现重复元素。找出这个重复值。

将元素值设为索引,可以将该数组变成一个链表,既然有重复数字,那必然可以形成一个环。

def findDepulicate(nums):
    # 慢指针p1和快指针p2位于起始点,p1每次走一步,p2每次走两步,直至相遇。
    # p1和p2分别表示列表中的索引
    p1, p2 = 0, 0
    while True:
        p1 = nums[p1]
        p2 = nums[nums[p2]]
        if p1 == p2:
            break
    # 慢指针p1回到起始点,此后p1和p2每次都只走一步,直到再次在循环起始点相遇
    # 相遇点的索引即为重复的元素
    p1 = 0
    while True:
        p1 = nums[p1]
        p2 = nums[p2]

        if p1 == p2:
            return p1

print(findDepulicate([2, 4, 3, 1, 3]))
## 3

Find the only unique numbers

异或运算:两二进制数的相同位次上的数相同则得0,相异则得1。

异或运算性质:

  1. \(a \oplus a=0\)

  2. \(a \oplus 0=a\)

  3. 满足交换律

所以:\[a \oplus b \oplus c \oplus a \oplus c=(a \oplus a) \oplus (c \oplus c) \oplus b=b\]

任务

给定一个数组,数组中除了一个元素有奇数次重复外,其他所有元素均为偶数次重复。找到这个奇数次重复的数。

def findUnique(nums):
    if len(nums) == 1:
        return nums[0]
    else:
        target_num = nums[0]
        for i in range(1, len(nums)):
            target_num ^= nums[i]
        return target_num

nums = [1, 3, 5, 7, 3, 5, 0, 0, 1]
print(findUnique(nums))
## 7

DP

0/1 package problem

The storage complexity of this method is \(O(NM)\), where \(N\) is the number of input item and the \(M\ (M\leq\text{capacity})\) is the Volume.

def knapsack(values, weights, capacity):
    n = len(values)
    # 初始化DP表,大小为(n+1) x (capacity+1),全部填充为0
    dp = [[0 for _ in range(capacity + 1)] for _ in range(n + 1)]

    # 填充DP表
    for i in range(1, n + 1):
        for w in range(1, capacity + 1):
            # 如果当前物品i的重量大于当前容量w,则不装入背包
            if weights[i-1] > w:
                dp[i][w] = dp[i-1][w]
            else:
                # 选择不装入或装入背包中,取价值较大的
                dp[i][w] = max(dp[i-1][w], dp[i-1][w-weights[i-1]] + values[i-1])

    # 返回DP表最后一个元素,即为背包能装下的所有物品的最大价值
    return dp[n][capacity]

# 示例
weights = [2, 3, 6, 5]
values = [6, 3, 5, 4]
capacity = 9
print(knapsack(values, weights, capacity))
## 11

0/1 package problem with less storage complexity

The storage complexity of this optimized method is \(O(M)\), where the \(M\ (M\leq\text{capacity})\) is the Volume.

def knapsack(values, weights, capacity):
    N = len(values)
    dp = [0] * (capacity + 1)
    for i in range(N):
        for j in range(capacity, 0, -1):
            if weights[i] > j:
                # 放不下
                break
            else:
                # 能放下
                dp[j] = max(dp[j - weights[i]] + values[i], dp[j])
    return dp[-1]

# 示例
weights = [2, 3, 6, 5]
values = [6, 3, 5, 4]
capacity = 9
print(knapsack(values, weights, capacity))
## 11