The defaultdict in the following code returns a non-default value for a missing key.
from collections import defaultdict
from typing import List
import logging
from pprint import pprint
logging.basicConfig(level=logging.NOTSET)
class CategoryEncoder:
    """Given a list of set of categories, build up indexes corresponding to
    each category. Each category in each column is mapped to a unique value
    (here called indexes). """
    def __init__(self, all_categories: List[List[str]]):
        """
        Args:
            all_categories (List[List[str]]): list of category groups, e.g.
            [
                ['A', 'B', 'C'],
                ['D', 'E'],
                ['F', 'G', 'H', 'I', 'J']
            ]
            The groups need not be of the same size.
        """
        self.all_categories = all_categories
        # a list of per-column mappings from category to index;
        # self.index[i] is the mapping for the i'th column in the dataset (csv)
        self.index, self.offset, offset = [], 0, 0
        for column_index, categories in enumerate(all_categories):
            categories = set(categories)
            logging.debug(f'{column_index}, {offset}')
            self.index.append(defaultdict(lambda: offset))
            for index, category in enumerate(categories):
                self.index[column_index][category] = index + 1 + offset
            
            offset += len(categories) + 1
        
        self.offset = offset
    def get_index(self, column_index, word):
        return self.index[column_index][word]
    def __len__(self):
        return self.offset
if __name__ == '__main__':
    all_categories = [
        ['A', 'B', 'C'],
        ['D', 'E'],
        ['F', 'G', 'H', 'I', 'J'],
    ]
    encoder = CategoryEncoder(all_categories)
    print('#', encoder.index[0]['#'])
    pprint(encoder.index)
with output:
DEBUG:root:0, 0
DEBUG:root:1, 4
DEBUG:root:2, 7
# 13
[defaultdict(<function CategoryEncoder.__init__.<locals>.<lambda> at 0x7fbef2dbe280>,
             {'#': 13,
              'A': 3,
              'B': 2,
              'C': 1}),
 defaultdict(<function CategoryEncoder.__init__.<locals>.<lambda> at 0x7fbedbee6ee0>,
             {'D': 5,
              'E': 6}),
 defaultdict(<function CategoryEncoder.__init__.<locals>.<lambda> at 0x7fbedbee6f70>,
             {'F': 11,
              'G': 10,
              'H': 12,
              'I': 9,
              'J': 8})]
Some details about the code (to make it easier to follow):
- The code takes strings grouped in lists and assigns a unique id to each of them. For each group it also saves a separate id, which should be assigned to any string that is not present in that group. E.g. for the group ['A', 'B', 'C'] the id 0 is saved, and it should be the default value of the defaultdict for that column (see the logging.debug(f'{column_index}, {offset}') line, and the expected values sketched after this list).
- self.index is a list that starts out empty and, by the end of __init__, contains one defaultdict per list in the all_categories argument. It is printed at the end; see the output above.
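To make the expectation concrete, here is what I thought get_index would return for the encoder built above, based on the offsets in the debug output (0, 4, 7). The '#' key is just an arbitrary string that is not a known category:

encoder.get_index(0, 'A')   # 1, 2 or 3: ids of known categories in column 0
encoder.get_index(0, '#')   # expected 0, the offset logged for column 0
encoder.get_index(1, '#')   # expected 4, the offset logged for column 1
encoder.get_index(2, '#')   # expected 7, the offset logged for column 2

Instead, as the output above shows, the unknown key '#' in column 0 comes back as 13.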
The problem:
The logging.debug line shows that the defaultdict stored at index 0 of self.index was created while offset was 0, so its default value should be 0. But when I access a missing key with print('#', encoder.index[0]['#']), it returns 13.
I am not sure why.
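The same surprise shows up in a minimal snippet that, as far as I can tell, follows the same pattern (no encoder class, just a defaultdict created in a loop):

from collections import defaultdict

dicts = []
offset = 0
for _ in range(3):
    # intent: this dict's default should be whatever offset is right now
    dicts.append(defaultdict(lambda: offset))
    offset += 5

# every dict reports the final value of offset (15), not the value
# offset had when that dict was created (0, 5 and 10)
print([d['missing'] for d in dicts])  # prints [15, 15, 15]

So the question seems to boil down to: why does each defaultdict's default factory see the final value of offset, rather than the value offset had at the time the defaultdict was created?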
