%
% Code for a Self-Organizing Map (SOM) in Matlab.
%
% As a model of the brain, the model features the use of 
% cortical columns and receptive fields
%
% (c) Frédéric Dandurand, 2011
%

% ======================================================================
% Session reset and environment
% ======================================================================

% start from an empty workspace with no open figure
clear all
close all

% the SOM helper functions live in the sibling "library" folder
addpath('../library');

% fixed seed: successive runs draw the same random numbers, so results
% can be replicated
rand('seed', 2)

% ======================================================================
% Diagnostics
% ======================================================================

% debug = false : fast vector-based processing only.
% debug = true  : the flag is forwarded to the library helpers which, per
%                 the original notes, additionally run the slower but more
%                 readable item-based (for-loop) computations and compare
%                 them with the vector-based ones; the script is also more
%                 verbose and plots the content of the visibility matrix.
debug = false;

% ======================================================================
% Input layer: letters, slots and visibility
% ======================================================================

% longest word (in letters) that can be taken from the dictionary
maxWordSize = 7;

% size of the alphabet (26 for English); this is also the number of rows
% of the input map
letterCount = 26;

% parallelCodings = true  : the letter bank is twice maxWordSize wide and
%                           each word is shown twice, left-aligned and
%                           right-aligned:
%                               TABLE####TABLE
%                               WITH######WITH
%                               SILENCESILENCE
% parallelCodings = false : the letter bank is maxWordSize wide and the
%                           word is shown once, centred in the bank:
%                               #TABLE#
%                               #WITH##
%                               SILENCE
parallelCodings = true;

% useVisibility = true  : letter visibility follows experimental values of
%                         human letter visibility (see Stevens & Grainger
%                         2003), loaded from a data file.
% useVisibility = false : constant (ideal, perfect) visibility of 1.
useVisibility = true;

% Input values are binary (feature absent / feature present).
% noiseLevel sets both the amplitude of the uniform noise added to the
% non-empty inputs and the two noise-free targets:
%   lowTarget  = noiseLevel      (feature absent)
%   highTarget = 1 - noiseLevel  (feature present)
% With noiseLevel = 0 the targets are exactly 0 and 1.
% Design intent stated by the original author: after noise, "absent"
% values lie in [0, 2*noiseLevel] and "present" values in
% [1 - 2*noiseLevel, 1]. The noise itself is drawn in the training loop.
noiseLevel = 0.01;
lowTarget = noiseLevel;
highTarget = 1 - noiseLevel;

% ======================================================================
% SOM architecture
% ======================================================================

% [rows cols] of the SOM map onto which the inputs are projected
mapSize = [15 10];

% number of cortical columns in each input; a single one here
CORTICAL_COLS = 1;

% extent of the input layer that one map unit can see. INF gives a
% standard SOM in which every map unit sees all inputs.
receptiveFieldSize = [inf inf];

% Initial connection weights: intended to be drawn uniformly around
% meanInitialWeight with a half-width of weightRange. The draw itself is
% done where the "weights" array is created.
meanInitialWeight = 0.2;
weightRange = 0.2;

% ======================================================================
% Learning
% ======================================================================

% number of SOM learning iterations (epochs)
maxIter = 10000;

% magnitude of the adjustment of input weights towards the input values
learningRate = 0.5;

% magnitude of the contribution of neighbouring units as a function of
% their distance to the winner.
% NOTE(review): the vector has four entries although the original comment
% mentioned distances 1, 2 and 3 only; presumably the first entry applies
% to the winner itself - confirm in computeUpdatedWeights.
factorVector = [1 0.3 0.12 0.05];

% temperature; when > 0 it determines an exponential decay of learning
temperatureFactor = 0.0;

% Selection of the best matching unit (SOM winner):
%   true  -> unit with minimal distance to the input vector
%   false -> most active unit
useDistance = true;

% Used when useDistance is true: a word whose winner distance is below
% this value counts as learned well enough and is put in the skip cache.
distanceLearningThreshold = 1e-15;

% Used when useDistance is false: a word for which some unit is active
% above this value counts as learned well enough and is put in the skip
% cache.
activationLearningThreshold = 0.8;

% Very frequent words (e.g., "a", "the") are drawn (too) often. Once such
% a word is well learnt, it is skipped for its next SKIP_ITER draws and is
% then considered for training again.
SKIP_ITER = 25;

% Accept only odd values for receptive field sizes.
% Infinite entries (standard SOM, every unit sees all inputs) are exempt
% from the test; they must be filtered out explicitly because mod(Inf, 2)
% is NaN, which is never equal to 1.
% any() is required: an "if" applied to a vector is true only when ALL of
% its elements are true, so without it a single even (or non-integer)
% value would slip through.
if (any(mod(receptiveFieldSize(isfinite(receptiveFieldSize)), 2) ~= 1))
    error('Receptive field size must be composed of odd (i.e., not even) values')
end

% Directory where results are stored. One sub-directory level is appended
% per option so that runs with different settings do not overwrite each
% other.
root = '../../results/LettersToSubLexicalUnits/';

% first level: winner selection method
if (useDistance)
    root = [root, 'useMinDistance/'];
else
    root = [root, 'useMaxActivation/'];
end

% second level: letter visibility model
if (useVisibility)
    root = [root, 'realisticVisibility/'];
else
    root = [root, 'constantVisibility/'];
end

% Create the directory only when it is missing (mkdir on an existing
% directory issues a warning on every re-run), and stop right away with a
% clear message if it cannot be created instead of failing later when the
% results file is opened.
if (~exist(root, 'dir'))
    [mkdirOk, mkdirMsg] = mkdir(root);
    if (~mkdirOk)
        error('Could not create results directory "%s": %s', root, mkdirMsg);
    end
end

% Load dictionary data. The file has one header line, then three columns
% per line read as: word (string), word frequency (integer) and word
% probability (float).
[wordList,wordFreq,wordProb] = textread('../../data/celex.txt','%s %d %f','headerlines',1);

% text log of the map content (appended to during training)
fileName = [root, 'Results.txt'];
% snapshot of the whole workspace (rewritten during training)
netFileName = [root, 'workspace.mat'];

% Indexes in visibilityMatrix:
% 1 = word length
% 2 = letter pos in string
% 3 = fixated letter
% Default: constant (perfect) visibility of 1 everywhere.
visibilityMatrix = ones(maxWordSize,maxWordSize,maxWordSize);
if (useVisibility)
    % Experimental visibility: start from zeros so that combinations that
    % are absent from the data file stay at 0.
    visibilityMatrix = zeros(maxWordSize,maxWordSize,maxWordSize);

    % Visibility data columns, as they are used by the loop below:
    %   column 1    word length
    %   column 2    letter position in string
    %   column 3    fixated letter
    %   column 4    recognition probability = visibility
    % NOTE(review): the original header comment listed the columns as
    % "letter position in string / fixated letter / position of letter
    % relative to fixation / recognition probability", which does not match
    % how the columns are read here - confirm against the data file.
    visibilityDat = textread('../../data/visibility data.txt');

    % copy each data row into its (length, position, fixation) cell
    for i=1:size(visibilityDat,1)
        wordSize = visibilityDat(i,1);
        letterPos = visibilityDat(i,2);
        fixPos = visibilityDat(i,3);
        visibilityMatrix(wordSize, letterPos, fixPos) = visibilityDat(i,4);
    end
end

if (debug)
    % In debug mode, plot the content of the visibility matrix: one curve
    % per (word length, fixation position) pair, longest words first.
    % The upper bound follows maxWordSize instead of a hard-coded 7 so that
    % the plots stay in sync with the configured maximal word length.
    for wordSize=maxWordSize:-1:1
        for fixPos = 1:wordSize
            plot(visibilityMatrix(wordSize, 1:wordSize, fixPos));
            ylabel('Recognition probability')
            xlabel('Letter within word position');
            title([num2str(wordSize), '-letter word, fixation at position ', num2str(fixPos)]);
            pause   % wait for a key press before showing the next curve
        end
    end
end

% To be considered as coding for a given input, a connection weight must
% be larger than this threshold: half of the mean of the non-zero
% visibility values.
activeThreshold = 0.5 * mean(visibilityMatrix(visibilityMatrix>0));

% Lower (input) map: one row per letter, one column per letter slot
dim1 = letterCount;       % number of rows of lower (input) map
if (parallelCodings)
    dim2 = 2*maxWordSize; % number of cols: word shown twice (left- and right-aligned)
else
    dim2 = maxWordSize;   % number of cols: word shown once
end

% Upper (SOM) map
dim3 = mapSize(1);  % number of rows in the upper (SOM) map
dim4 = mapSize(2);  % number of cols in the upper (SOM) map

% location of active connections are indicated by values of one (1), and units not connected = 0
weightsMask = getWeightsMask(dim1, dim2, CORTICAL_COLS, dim3, dim4, receptiveFieldSize);

% Set random connection weights between the units connected on the input
% and the SOM maps. Weights are uniform in
% [meanInitialWeight - weightRange, meanInitialWeight + weightRange]:
% rand - 0.5 is uniform in [-0.5, 0.5], so 2 * weightRange * (rand - 0.5)
% is uniform in [-weightRange, +weightRange].
% (The previous "rand - 1" shifted the whole range down by weightRange,
% giving [mean - 2*range, mean], i.e. partly negative weights with the
% default settings.)
weights = meanInitialWeight + 2 * weightRange * (rand(size(weightsMask)) - 0.5);
% zero out the weights of input/map unit pairs that are not connected
weights = weights .* weightsMask;

skippedWords = hashtable;   % cache of the words currently skipped (word -> remaining skips)

iter = 1;

% Training proceeds until maxIter words have actually been presented to
% the map. Draws that are rejected by the skip cache do not advance iter.
while (iter <= maxIter)
    % select a word for training (getword receives the word list, the word
    % probabilities and the maximal word length)
    word = char(getword(wordList, wordProb, maxWordSize));

    % check if word is in cache of words to skip
    if (iskey(skippedWords,word))
        % yes, how many iterations left?
        iterToGo = get(skippedWords, word);
        if (iterToGo > 0)
            skippedWords = put(skippedWords,word,iterToGo-1);  % update number of iterations to skip
            skip = true;
        else
            % been skipped for enough iterations, process it this time
            skippedWords = remove(skippedWords, word);
            skip = false;
            disp(['   Skipped words: Removing "', word, '"'])
        end
    else
        % word not in skipped words cache so use it
        skip = false;
    end

    if (~skip)
        % word is not skipped, so process it
        wordSize = size(word, 2);

        % fixation is assumed at the center of the word
        fixationPos = ceil(wordSize/2);
        % visibility of each letter of the word for that fixation
        wordVisibility = visibilityMatrix(wordSize,1:wordSize,fixationPos);

        % Fill buffer with a new word
        if (parallelCodings)
            % Under this coding, a buffer of twice the maximal length
            % admissible for words is presented with the selected word at
            % two different locations: left-aligned and right-aligned
            buffer = zeros(1,2*maxWordSize);     % letter buffer has twice the longest admissible length
            visibility = zeros(1,2*maxWordSize); % visibility buffer for the corresponding locations

            % Process left-aligned word: starts at position 1 in buffer
            firstLetterPos1 = 1;
            buffer(firstLetterPos1:firstLetterPos1+wordSize-1) = word;
            visibility(firstLetterPos1:firstLetterPos1+wordSize-1) = wordVisibility;

            % Process right-aligned word: starts so that its last letter
            % falls on the last position of the buffer
            firstLetterPos2 = 2*maxWordSize - wordSize + 1;
            buffer(firstLetterPos2:end) = word;
            visibility(firstLetterPos2:end) = wordVisibility;
        else
            % normal coding (at the center of the buffer)
            buffer = zeros(1,maxWordSize);     % letter buffer holds the word once
            visibility = zeros(1,maxWordSize); % corresponding visibility buffer

            % Position of the first letter: ceil(maxWordSize/2) is the
            % center of the buffer and floor(wordSize/2) is half the length
            % of the word, so that half of the word falls before the center
            % and half after, i.e. the word is centred in the letter buffer
            firstLetterPos = ceil(maxWordSize/2) - floor(wordSize/2);

            buffer(firstLetterPos:firstLetterPos+wordSize-1) = word;            % filling letter buffer
            visibility(firstLetterPos:firstLetterPos+wordSize-1) = wordVisibility;  % filling visibility buffer
        end

        % Encode letter buffer into the input pattern and apply the
        % visibility value (as a multiplication).
        % Since each buffer position is represented by letterCount rows
        % (e.g. 26), one per possible letter, the visibility row vector is
        % repeated (repmat) letterCount times to match the pattern size.
        pat = encode(buffer, letterCount, lowTarget, highTarget, debug) .* repmat(visibility, letterCount, 1);
        nonEmptyIndexes = find(pat > 0);

        % Add uniform noise in [-noiseLevel, +noiseLevel] to the input
        % values (only where letters are located). 2*rand - 1 is uniform in
        % [-1, 1]; the previous "noiseLevel * rand" was one-sided
        % ([0, +noiseLevel]) and so biased every input upwards.
        % Note: entries strongly attenuated by visibility can end up
        % slightly below zero after noise is added.
        pat(nonEmptyIndexes) = pat(nonEmptyIndexes) + noiseLevel * (2 * rand(size(nonEmptyIndexes)) - 1);

        % Select the winner and decide whether the word is already
        % well-coded. A well-coded word is inserted in the words-to-be-
        % skipped cache so that its next SKIP_ITER draws are ignored; the
        % current presentation is still used for learning below.
        if (useDistance)
            % compute distances matrix, minimal distance, and coordinates
            % of unit with minimal distance to the input vector
            [distances, minDistVal, winnerRow, winnerCol] = computeSOMDistances(pat, weights, weightsMask, debug);

            % a small distance indicates that the word is already well-coded
            wellLearned = (minDistVal < distanceLearningThreshold);
        else
            % compute activations matrix, maximal activation, and
            % coordinates of the most active unit
            [activations, maxActVal, winnerRow, winnerCol] = computeSOMActivations(pat, weights, weightsMask, debug);

            % a large activation indicates that the word is already well-coded
            wellLearned = (maxActVal > activationLearningThreshold);
        end

        if (wellLearned)
            % word does not need to be learned for a while:
            % insert in skipped words cache
            skippedWords = put(skippedWords, word, SKIP_ITER);
            disp(['   Skipped words: Adding "', word, '"'])
        end

        % Let the winner (and its neighbourhood) learn
        updatedWeights = computeUpdatedWeights(weights,weightsMask,pat,...
            dim1,dim2,CORTICAL_COLS,dim3,dim4,...
            winnerRow,winnerCol,iter,learningRate,temperatureFactor,factorVector,debug);

        % use the updated weights in the next iteration
        weights = updatedWeights;

        % at every 500 iterations, display progress of training
        if (mod(iter,500) == 0)
            mapStr = ['Words processed = ', num2str(iter), char(10)];

            % counts how many map units code for the same letter combination
            hash = hashtable;

            % Estimating what the map encodes
            % by seeking which connection weights are above threshold
            for outX=1:dim3
                for outY=1:dim4
                    % for each SOM unit
                    posContent = '';
                    for slot=1:dim2
                        % for each slot position, seek which letters are
                        % above activeThreshold (first cortical column)
                        letters = find(weights(:,slot,1,outX,outY) > activeThreshold)';
                        if (isempty(letters))
                            % if none is found, plot a dot (.)
                            posContent = [ posContent, '.'];
                        elseif (size(letters,2) == 1)
                            % if only one is found, plot the letter
                            posContent = [ posContent, char(letters+'a'-1)];
                        else
                            % if more than one is found, plot all letters
                            % between parentheses
                            posContent = [ posContent, '(', char(letters+'a'-1), ')'];
                        end
                    end

                    % posContent is a string that identifies which letters,
                    % at the different positions of the buffer, are coded
                    % for by the current unit. It is used as a key in the
                    % hashtable, whose value is the number of units of the
                    % SOM map found with that same combination of letters.
                    if (iskey(hash,posContent))
                        % key already present: one more unit codes for this
                        % same combination of letters
                        repeatCount = get(hash,posContent) + 1;
                    else
                        % first time this combination is encountered
                        repeatCount = 1;
                    end
                    % create or update the number of repeats found for
                    % posContent as a key
                    hash = put(hash,posContent,repeatCount);

                    % separate units by a *
                    mapStr  = [mapStr, posContent, '*'];
                end
                % end of row, include a line feed
                mapStr = [mapStr, char(10)];
            end
            % end of map, include a line feed
            mapStr = [mapStr, char(10)];

            % retrieve all keys (letters coded) and values (number of
            % repetitions)
            k = keys(hash);
            v = values(hash);

            % format all this content of the hashtable as follows:
            % how many times x letters found
            % e.g., 2 x ...o...
            for i = 1 : size (k, 2)
                val = v(:,i);
                mapStr = [mapStr, num2str(val{1}), ' x ', char(k(:,i)), char(10)];
            end
            mapStr = [mapStr, char(10)];

            disp(mapStr)  % show the result on screen

            % and save result to file (appending); fopen returns -1 when
            % the file cannot be opened, so report that explicitly instead
            % of failing inside fprintf with an invalid file identifier
            [fid, fopenMsg] = fopen(fileName, 'at');
            if (fid == -1)
                error('Could not open results file "%s" for appending: %s', fileName, fopenMsg);
            end
            fprintf(fid, '%s', mapStr);
            fclose(fid);

            % snapshot of the whole workspace (weights, settings, ...)
            save(netFileName);
        elseif (mod(iter,100) == 0)
            % show progress of training every 100 iterations
            disp([num2str(iter), ' patterns processed' ])
        end

        % only words that were actually processed count as an iteration
        iter = iter + 1;
    end
end
