# Solidity Assembly Gas Optimized Keccak256

Solidity smart contracts often use `keccak256` to hash a number of input parameters; the standard way of calling this function looks like this:

```solidity
function getKeccak256(uint256 a, uint256 b, uint256 c) external pure returns(bytes32 result) {
    result = keccak256(abi.encode(a,b,c));
}
```

However it is possible to reduce the gas cost of computing the hash by ~42% using the following assembly:

```solidity
function getKeccak256(uint256 a, uint256 b, uint256 c) external pure returns(bytes32 result) {
    assembly {
        let mPtr := mload(0x40)
        mstore(mPtr, a)
        mstore(add(mPtr, 0x20), b)
        mstore(add(mPtr, 0x40), c)

        result := keccak256(mPtr, 0x60)
    }
}
```

Let's examine how and why this gas saving occurs!

### Prerequisite Knowledge

In order to understand the next section you'll need to have completed at least Section 1 of Updraft's [Assembly & Formal Verification course](https://updraft.cyfrin.io/courses/formal-verification) or already have equivalent knowledge of:

* [EVM Opcodes](https://www.evm.codes/)
    
* [EVM Stack Machine](https://faizannehal.medium.com/understanding-the-stack-based-architecture-of-evm-af45dc9819f2)
    
* Solidity's [Free Memory Pointer](https://docs.soliditylang.org/en/latest/internals/layout_in_memory.html)
    
* [Foundry's Debugger](https://book.getfoundry.sh/forge/debugger)
    

### Foundry Testing Code

To examine the execution of both functions we'll use the following stand-alone Foundry test contract:

```solidity
// SPDX-License-Identifier: MIT
pragma solidity 0.8.25;

import "forge-std/Test.sol";

// separate contracts for each implementation then accessing
// implementations via an interface in the test contract
// prevents the optimizer from being "too smart", helping
// to better approximate real-world execution
interface IGasImpl {
    function getKeccak256(uint256 a, uint256 b, uint256 c) external pure returns(bytes32 result);
}

contract GasImplNormal is IGasImpl {
    function getKeccak256(uint256 a, uint256 b, uint256 c) external pure returns(bytes32 result) {
        result = keccak256(abi.encode(a,b,c));
    }
}

contract GasImplAssembly is IGasImpl {
    function getKeccak256(uint256 a, uint256 b, uint256 c) external pure returns(bytes32 result) {
        assembly {
            let mPtr := mload(0x40)
            mstore(mPtr, a)
            mstore(add(mPtr, 0x20), b)
            mstore(add(mPtr, 0x40), c)

            result := keccak256(mPtr, 0x60)
        }
    }
}

// actual testing contract
contract GasDebugTest is Test {
    IGasImpl gasImplNormal   = new GasImplNormal();
    IGasImpl gasImplAssembly = new GasImplAssembly();
    uint256 a = 1;
    uint256 b = 2;
    uint256 c = 3;

    // forge test --match-contract GasDebugTest --debug test_GasImplNormal
    function test_GasImplNormal() external {
        bytes32 result = gasImplNormal.getKeccak256(a,b,c);
        assertEq(result, 0x6e0c627900b24bd432fe7b1f713f1b0744091a646a9fe4a65a18dfed21f2949c);
    }

    // forge test --match-contract GasDebugTest --debug test_GasImplAssembly
    function test_GasImplAssembly() external {
        bytes32 result = gasImplAssembly.getKeccak256(a,b,c);
        assertEq(result, 0x6e0c627900b24bd432fe7b1f713f1b0744091a646a9fe4a65a18dfed21f2949c);
    }
}
```

### Execution Trace - Assembly Version

Next we'll step through the assembly version using Foundry's debugger by executing this command: `forge test --match-contract GasDebugTest --debug test_GasImplAssembly`

We are concerned with the execution inside the `GasImplAssembly` contract:

* from the first `PUSH1(0x40)` after calldata has been loaded onto the stack and `JUMPDEST` has been called
    
* until `keccak256` has been called and the calculated hash put on the stack
    

The above execution started at PC 0x39 (57) until 0x4f (79) and used 341-224 = 117 gas. Some useful acronyms are:

* Free Memory Pointer Address (FMPA)
    
* Starting Next Free Memory Address (SNFMA)
    
* Next Free Memory Address (NFMA)
    

Let's step through the execution examining how the assembly version works:

```solidity
// push Free Memory Pointer Address (FMPA) onto stack
PUSH1(0x40) [Stack : 0x40, 0x03, 0x02, 0x01, 0x52, 0x05536b19]    
            [Memory: 0x40 = 0x80                             ]

// duplicate FMPA
DUP1        [Stack : 0x40, 0x40, 0x03, 0x02, 0x01, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80                                   ]

// load Starting Next Free Memory Address (SNFMA) by reading FMPA
MLOAD       [Stack : 0x80, 0x40, 0x03, 0x02, 0x01, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80                                   ]

// exchange 5th and 1st stack elements
// note: a common pattern follows to store input parameters in memory
SWAP4       [Stack : 0x01, 0x40, 0x03, 0x02, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80                                   ]

// duplicate SNFMA
DUP5        [Stack : 0x80, 0x01, 0x40, 0x03, 0x02, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80                                         ]

// store value of `a` into memory at SNFMA
MSTORE      [Stack : 0x40, 0x03, 0x02, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80, 0x80 = 0x01                ]

// push 0x20 onto stack (2nd param of first `add` call)
// this is an offset to calculate Next Free Memory Address (NFMA)
PUSH1(0x20) [Stack : 0x20, 0x40, 0x03, 0x02, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80, 0x80 = 0x01                      ]

// duplicate SNFMA
DUP5        [Stack : 0x80, 0x20, 0x40, 0x03, 0x02, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80, 0x80 = 0x01                            ]

// calculate NFMA by SNFMA + offset (0x80 + 0x20)
ADD         [Stack : 0xa0, 0x40, 0x03, 0x02, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80, 0x80 = 0x01                      ]

// exchange 4th and 1st stack elements
SWAP3       [Stack : 0x02, 0x40, 0x03, 0xa0, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80, 0x80 = 0x01                      ]

// exchange 2nd and 1st stack elements
SWAP1       [Stack : 0x40, 0x02, 0x03, 0xa0, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80, 0x80 = 0x01                      ]

// exchange 4th and 1st stack elements
SWAP3       [Stack : 0xa0, 0x02, 0x03, 0x40, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80, 0x80 = 0x01                      ]

// store value of `b` into memory at NFMA
MSTORE      [Stack : 0x03, 0x40, 0x80, 0x52, 0x05536b19   ]
            [Memory: 0x40 = 0x80, 0x80 = 0x01, 0xa0 = 0x02]

// exchange 2nd and 1st stack elements
SWAP1       [Stack : 0x40, 0x03, 0x80, 0x52, 0x05536b19   ]
            [Memory: 0x40 = 0x80, 0x80 = 0x01, 0xa0 = 0x02]

// duplicate SNFMA
DUP3        [Stack : 0x80, 0x40, 0x03, 0x80, 0x52, 0x05536b19]
            [Memory: 0x40 = 0x80, 0x80 = 0x01, 0xa0 = 0x02   ]

// calculate NFMA by SNFMA + offset (0x80 + 0x40)
ADD         [Stack : 0xc0, 0x03, 0x80, 0x52, 0x05536b19   ]
            [Memory: 0x40 = 0x80, 0x80 = 0x01, 0xa0 = 0x02]

// store value of `c` into memory at NFMA
MSTORE      [Stack : 0x80, 0x52, 0x05536b19                            ]
            [Memory: 0x40 = 0x80, 0x80 = 0x01, 0xa0 = 0x02, 0xc0 = 0x03]

// note: all input parameters are now stored in memory

// push `size` parameter for call to keccak onto stack
PUSH1(0x60) [Stack : 0x60, 0x80, 0x52, 0x05536b19                      ]
            [Memory: 0x40 = 0x80, 0x80 = 0x01, 0xa0 = 0x02, 0xc0 = 0x03]

// exchange 2nd and 1st stack elements
SWAP1       [Stack : 0x80, 0x60, 0x52, 0x05536b19                      ]
            [Memory: 0x40 = 0x80, 0x80 = 0x01, 0xa0 = 0x02, 0xc0 = 0x03]

// call keccak256(offset, size)
KECCAK256   [Stack : result, 0x52, 0x05536b19                          ]
            [Memory: 0x40 = 0x80, 0x80 = 0x01, 0xa0 = 0x02, 0xc0 = 0x03]
```

The assembly version:

* stored the first input parameter `a` at the Starting Next Free Memory Address (SNFMA)
    
* calculated 2 additional Next Free Memory Addresses (NFMA) which it used to store input parameters `b, c`
    
* once this was completed there were only 3 more opcodes; 2 to prepare the `offset, size` input parameters and the final to call `keccak256`
    
* 20 opcodes in total were executed including 1 `MLOAD` and 3 `MSTORE`
    

### Execution Trace - Solidity Version

Having examined the assembly version we'll now turn to the solidity version using Foundry's debugger by executing this command: `forge test --match-contract GasDebugTest --debug test_GasImplNormal`

We are concerned with the execution inside the `GasImplNormal` contract:

* from the first `PUSH1(0x40)` after calldata has been loaded onto the stack and `JUMPDEST` has been called
    
* until `keccak256` has been called and the calculated hash put on the stack
    

The above execution started at PC 0x39 (57) until 0x6c (108) and used 428-224 = 204 gas; 74% more than the assembly version!

```solidity
// push Free Memory Pointer Address (FMPA) onto stack
PUSH1(0x40) [Stack : 0x40, 0x03, 0x02, 0x01, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                             ]

// duplicate FMPA
DUP1        [Stack : 0x40, 0x40, 0x03, 0x02, 0x01, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                                   ]

// load Starting Next Free Memory Address (SNFMA) by reading FMPA
MLOAD       [Stack : 0x80, 0x40, 0x03, 0x02, 0x01, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                                   ]

// push 0x20 onto stack
// this is an offset to calculate Next Free Memory Address (NFMA)
PUSH1(0x20) [Stack : 0x20, 0x80, 0x40, 0x03, 0x02, 0x01, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                                         ]

// duplicate offset
DUP1        [Stack : 0x20, 0x20, 0x80, 0x40, 0x03, 0x02, 0x01, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                                               ]

// duplicate SNFMA
DUP3        [Stack : 0x80, 0x20, 0x20, 0x80, 0x40, 0x03, 0x02, 0x01, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                                                     ]

// calculate NFMA by SNFMA + offset (0x80 + 0x20)
ADD         [Stack : 0xa0, 0x20, 0x80, 0x40, 0x03, 0x02, 0x01, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                                               ]

// exchange 7th and 1st stack elements
// note: a common pattern follows to store input parameters in memory
SWAP6       [Stack : 0x01, 0x20, 0x80, 0x40, 0x03, 0x02, 0xa0, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                                               ]

// exchange 2nd and 1st stack elements
SWAP1       [Stack : 0x20, 0x01, 0x80, 0x40, 0x03, 0x02, 0xa0, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                                               ]

// exchange 7th and 1st stack elements
SWAP6       [Stack : 0xa0, 0x01, 0x80, 0x40, 0x03, 0x02, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80                                               ]

// store value of `a` into memory at NFMA
MSTORE      [Stack : 0x80, 0x40, 0x03, 0x02, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01                      ]

// note: normal version doesn't store first input at SNFMA so will have 
// to calculate one extra NFMA to store all inputs into memory compared 
// to assembly version

// duplicate SNFMA
DUP1        [Stack : 0x80, 0x80, 0x40, 0x03, 0x02, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01                            ]

// duplicate FMPA
DUP3        [Stack : 0x40, 0x80, 0x80, 0x40, 0x03, 0x02, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01                                  ]

// calculate NFMA by FMPA + offset (0x40 + 0x80)
ADD         [Stack : 0xc0, 0x80, 0x40, 0x03, 0x02, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01                            ]

// exchange 5th and 1st stack elements
SWAP4       [Stack : 0x02, 0x80, 0x40, 0x03, 0xc0, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01                            ]

// exchange 2nd and 1st stack elements
SWAP1       [Stack : 0x80, 0x02, 0x40, 0x03, 0xc0, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01                            ]

// exchange 5th and 1st stack elements
SWAP4       [Stack : 0xc0, 0x02, 0x40, 0x03, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01                            ]

// store value of `b` into memory at NFMA
MSTORE      [Stack : 0x40, 0x03, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02   ]

// another offset used calculate NFMA
PUSH1(0x60) [Stack : 0x60, 0x40, 0x03, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02         ]

// duplicate the offset
DUP1        [Stack : 0x60, 0x60, 0x40, 0x03, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02               ]

// duplicate SNFMA
DUP5        [Stack : 0x80, 0x60, 0x60, 0x40, 0x03, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02                     ]

// calculate NFMA by SNFMA + offset (0x80 + 0x60)
ADD         [Stack : 0xe0, 0x60, 0x40, 0x03, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02               ]

// exchange 4th and 1st stack elements
SWAP3       [Stack : 0x03, 0x60, 0x40, 0xe0, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02               ]

// exchange 2nd and 1st stack elements
SWAP1       [Stack : 0x60, 0x03, 0x40, 0xe0, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02               ]

// exchange 4th and 1st stack elements
SWAP3       [Stack : 0xe0, 0x03, 0x40, 0x60, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02               ]

// store value of `c` into memory at NFMA
MSTORE      [Stack : 0x40, 0x60, 0x80, 0x20, 0x6f, 0x05536b19          ]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// note: all input parameters are now stored in memory

// duplicate FMPA
DUP1        [Stack : 0x40, 0x40, 0x60, 0x80, 0x20, 0x6f, 0x05536b19    ]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// load Starting Next Free Memory Address (SNFMA) by reading FMPA
MLOAD       [Stack : 0x80, 0x40, 0x60, 0x80, 0x20, 0x6f, 0x05536b19    ]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// duplicate SNFMA
// note: subsequent opcodes related to `abi.encode`
DUP1        [Stack : 0x80, 0x80, 0x40, 0x60, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03  ]

// duplicate SNFMA
DUP5        [Stack : 0x80, 0x80, 0x80, 0x40, 0x60, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03        ]

// subtract 2nd element from 1st
SUB         [Stack : 0x00, 0x80, 0x40, 0x60, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03  ]

// exchange 2nd and 1st stack elements
SWAP1       [Stack : 0x80, 0x00, 0x40, 0x60, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03  ]

// exchange 4th and 1st stack elements
SWAP3       [Stack : 0x60, 0x00, 0x40, 0x80, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03  ]

// calculate the `size` parameter required for
// the later keccak256 call
ADD         [Stack : 0x60, 0x40, 0x80, 0x80, 0x20, 0x6f, 0x05536b19    ]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// duplicate SNFMA
DUP3        [Stack : 0x80, 0x60, 0x40, 0x80, 0x80, 0x20, 0x6f, 0x05536b19]
            [Memory: 0x40 = 0x80, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03  ]

// stores the `size` parameter (0x60) at SNFMA
// which will be used later for the keccak256 call
MSTORE      [Stack : 0x40, 0x80, 0x80, 0x20, 0x6f, 0x05536b19                       ]
            [Memory: 0x40 = 0x80, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// push SNFMA onto stack
PUSH1(0x80) [Stack : 0x80, 0x40, 0x80, 0x80, 0x20, 0x6f, 0x05536b19                 ]
            [Memory: 0x40 = 0x80, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// exchange 2nd and 1st stack elements
SWAP1       [Stack : 0x40, 0x80, 0x80, 0x80, 0x20, 0x6f, 0x05536b19                 ]
            [Memory: 0x40 = 0x80, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// exchange 4th and 1st stack elements
SWAP3       [Stack : 0x80, 0x80, 0x80, 0x40, 0x20, 0x6f, 0x05536b19                 ]
            [Memory: 0x40 = 0x80, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// calculate NFMA
ADD         [Stack : 0x100, 0x80, 0x40, 0x20, 0x6f, 0x05536b19                      ]
            [Memory: 0x40 = 0x80, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// exchange 2nd and 1st stack elements
SWAP1       [Stack : 0x80, 0x100, 0x40, 0x20, 0x6f, 0x05536b19                      ]
            [Memory: 0x40 = 0x80, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// exchange 3rd and 1st stack elements
SWAP2       [Stack : 0x40, 0x100, 0x80, 0x20, 0x6f, 0x05536b19                      ]
            [Memory: 0x40 = 0x80, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// over-write value of FMPA with next NFMA
MSTORE      [Stack : 0x80, 0x20, 0x6f, 0x05536b19                                    ]
            [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// note: normal version had to calculate a second additional NFMA
// and update memory at FMPA; optimized version didn't do any of this

// duplicate SNFMA
DUP1        [Stack : 0x80, 0x80, 0x20, 0x6f, 0x05536b19                              ]
            [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// put value stored at SNFMA on stack
// this will be the `size` parameter for the keccak256
// call previously calculated
MLOAD       [Stack : 0x60, 0x80, 0x20, 0x6f, 0x05536b19                              ]
            [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// exchange 3rd and 1st stack elements
SWAP2       [Stack : 0x20, 0x80, 0x60, 0x6f, 0x05536b19                              ]
            [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// calculate memory address of first parameter; used
// as `offset` for keccak256
ADD         [Stack : 0xa0, 0x60, 0x6f, 0x05536b19                                    ]
            [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// call keccak256(offset, size)
KECCAK256   [result, 0x6f, 0x05536b19                                                ]
            [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]
```

### Gas Usage Comparison

In contrast to the assembly version, the solidity version:

* did **not** store the first input parameter `a` at the Starting Next Free Memory Address (SNFMA)
    
* instead it calculated 3 additional Next Free Memory Addresses (NFMA) which it used to store input parameters `a, b, c`
    
* once this was completed there were 22 more opcodes in contrast to the assembly version's 3 more opcodes
    
* calculated the `offset` parameter for `keccak256` which the assembly version hard-coded
    
* updated the Free Memory Pointer Address (FMPA) which the assembly version didn't need to, even though it didn't end up using the updated address (`0x100`)
    
* executed 48 opcodes in total compared to the 20 opcodes of the assembly version, including 3 `MLOAD` and 5 `MSTORE`
    
* used 204 gas instead of the assembly version's 117 resulting in a 74% increase to gas usage (204-117=87, (87/117)\*100 = 74)
    
* hence the assembly version provided a 42% gas saving compared to the solidity version (204-117=87, (87/204)\*100 = 42)
    

### Does Compiling With --via-ir Help?

Compiling with --via-ir offers modest gas improvements to the relevant code:

* assembly version uses 108 gas, down from 117 gas
    
* solidity version uses 195 gas, down from 204 gas
    

The relevant execution traces follow.

### Execution Trace - Assembly --via-ir Version

Execute this command: `forge test --match-contract GasDebugTest --debug test_GasImplAssembly --via-ir`

We are concerned with the execution inside the `GasImplAssembly` contract:

* from the first `CALLDATALOAD` of the first input parameter `a`
    
* until `keccak256` has been called and the calculated hash put on the stack
    

The above execution started at PC 0x32 (50) until 0x46 (70) and used 213-105 = 108 gas:

```solidity
// push value of `a` onto stack from calldata
CALLDATALOAD [Stack : 0x01]    
             [Memory:     ]

// push Starting Next Free Memory Address (SNFMA) onto stack
PUSH1(0x80)  [Stack : 0x80, 0x01]
             [Memory:           ]

// store value of `a` into memory at SNFMA
MSTORE       [Stack :            ]
             [Memory: 0x80 = 0x01]

// push offset on stack to read next input variable
PUSH1(0x24)  [Stack : 0x24       ]
             [Memory: 0x80 = 0x01]

// push value of `b` onto stack from calldata
CALLDATALOAD [Stack : 0x02       ]
             [Memory: 0x80 = 0x01]

// push Next Free Memory Address (NFMA) onto stack 
PUSH1(0xa0)  [Stack : 0xa0, 0x02 ]
             [Memory: 0x80 = 0x01]

// note: --via-ir was able to pre-compute the free
// memory addresses such that it doesn't have to calculate
// them at runtime but can just push them onto the stack

// store value of `b` into memory at Next Free Memory Address (NFMA)
MSTORE       [Stack :                         ]
             [Memory: 0x80 = 0x01, 0xa0 = 0x02]

// push offset on stack to read next input variable
PUSH1(0x44)  [Stack : 0x44                    ]
             [Memory: 0x80 = 0x01, 0xa0 = 0x02]

// push value of `c` onto stack from calldata
CALLDATALOAD [Stack : 0x03                    ]
             [Memory: 0x80 = 0x01, 0xa0 = 0x02]

// push Next Free Memory Address (NFMA) onto stack 
PUSH1(0xc0)  [Stack : 0xc0, 0x03              ]
             [Memory: 0x80 = 0x01, 0xa0 = 0x02]

// store value of `c` into memory at Next Free Memory Address (NFMA)
MSTORE       [Stack :                                      ]
             [Memory: 0x80 = 0x01, 0xa0 = 0x02, 0xc0 = 0x03]

// note: all input parameters are now stored in memory

// push `size` parameter for call to keccak onto stack
PUSH1(0x60)  [Stack : 0x60                                 ]
             [Memory: 0x80 = 0x01, 0xa0 = 0x02, 0xc0 = 0x03]

// push `offset` parameter for call to keccack onto stack
PUSH1(0x80)  [Stack : 0x80, 0x60                           ]
             [Memory: 0x80 = 0x01, 0xa0 = 0x02, 0xc0 = 0x03]

// call keccak256(offset, size)
KECCAK256   [Stack : result                               ]
            [Memory: 0x80 = 0x01, 0xa0 = 0x02, 0xc0 = 0x03]
```

### Execution Trace - Solidity --via-ir Version

Execute this command: `forge test --match-contract GasDebugTest --debug test_GasImplNormal --via-ir`

We are concerned with the execution inside the `GasImplNormal` contract:

* from the first `CALLDATALOAD` of the first input parameter `a`
    
* until `keccak256` has been called and the calculated hash put on the stack
    

The above execution started at PC 0x3a (58) until 0x72 (114) and used 318-123 = 195 gas:

```solidity
// push value of `a` onto stack from calldata
CALLDATALOAD [Stack : 0x01, 0xa0, 0x80, 0x00]    
             [Memory:                       ]

// duplicate Next Free Memory Address (NFMA) onto stack
DUP2         [Stack : 0x0a, 0x01, 0xa0, 0x80, 0x00]    
             [Memory:                             ]

// store value of `a` into memory at NFMA
MSTORE       [Stack : 0xa0, 0x80, 0x00]    
             [Memory: 0xa0 = 0x01     ]

// push offset on stack to read next input variable
PUSH1(0x24)  [Stack : 0x24, 0xa0, 0x80, 0x00]
             [Memory: 0xa0 = 0x01           ]

// push value of `b` onto stack from calldata
CALLDATALOAD [Stack : 0x02, 0xa0, 0x80, 0x00]
             [Memory: 0xa0 = 0x01           ]

// push Free Memory Pointer Address (FMPA) onto stack 
PUSH1(0x40)  [Stack : 0x40, 0x02, 0xa0, 0x80, 0x00]
             [Memory: 0xa0 = 0x01                 ]

// duplicate 0x80
DUP4         [Stack : 0x80, 0x40, 0x02, 0xa0, 0x80, 0x00]
             [Memory: 0xa0 = 0x01                       ]

// calculate Next Free Memory Address (NFMA)
ADD          [Stack : 0xc0, 0x02, 0xa0, 0x80, 0x00]
             [Memory: 0xa0 = 0x01                 ]

// store value of `b` into memory at NFMA
MSTORE       [Stack : 0xa0, 0x80, 0x00        ]    
             [Memory: 0xa0 = 0x01, 0xc0 = 0x02] 

// push offset on stack to read next input variable
PUSH1(0x44)  [Stack : 0x44, 0xa0, 0x80, 0x00  ]
             [Memory: 0xa0 = 0x01, 0xc0 = 0x02]

// push value of `c` onto stack from calldata
CALLDATALOAD [Stack : 0x03, 0xa0, 0x80, 0x00  ]
             [Memory: 0xa0 = 0x01, 0xc0 = 0x02]

// push offset to calculate NFMA
PUSH1(0x60)  [Stack : 0x60, 0x03, 0xa0, 0x80, 0x00]
             [Memory: 0xa0 = 0x01, 0xc0 = 0x02    ]

// duplicate 0x80
DUP4         [Stack : 0x80, 0x60, 0x03, 0xa0, 0x80, 0x00]
             [Memory: 0xa0 = 0x01, 0xc0 = 0x02          ]

// calculate NFMA
ADD          [Stack : 0xe0, 0x03, 0xa0, 0x80, 0x00]
             [Memory: 0xa0 = 0x01, 0xc0 = 0x02    ]

// store value of `c` into memory at NFMA
MSTORE       [Stack : 0xa0, 0x80, 0x00                     ]
             [Memory: 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// note: all input parameters are now stored in memory

// push 0x60 which will be saved to memory then later
// used as the `size` parameter for keccak256 call
PUSH1(0x60)  [Stack : 0x60, 0xa0, 0x80, 0x00               ]
             [Memory: 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// note: subsequent opcodes related to `abi.encode`

// duplicate memory address to save previously pushed `size` parameter
DUP3         [Stack : 0x80, 0x60, 0xa0, 0x80, 0x00         ]
             [Memory: 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// save the `size` parameter into memory
MSTORE       [Stack : 0xa0, 0x80, 0x00                                  ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// used to calculate NFMA
PUSH1(0x80)  [Stack : 0x80, 0xa0, 0x80, 0x00                            ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// used to calculate NFMA
DUP3         [Stack : 0x80, 0x80, 0xa0, 0x80, 0x00                      ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// calculates NFMA; value used in LT and GT comparison
// then saved later on to FMPA
ADD          [Stack : 0x100, 0xa0, 0x80, 0x00                           ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// exchange 3rd and 1st stack elements
SWAP2        [Stack : 0x80, 0xa0, 0x100, 0x00                           ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// used for LT comparison
DUP1         [Stack : 0x80, 0x80, 0xa0, 0x100, 0x00                     ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// used for LT comparison
DUP4         [Stack : 0x100, 0x80, 0x80, 0xa0, 0x100, 0x00              ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// 0x100 < 0x80 ? false
LT           [Stack : 0x00, 0x80, 0xa0, 0x100, 0x00                     ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// used for GT comparison
PUSH8(0xffffffffffffffff)
             [Stack : 0xffffffffffffffff, 0x00, 0x80, 0xa0, 0x100, 0x00 ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// duplicate
DUP5         [Stack : 0x100, 0xffffffffffffffff, 0x00, 0x80, 0xa0, 0x100, 0x00]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03      ]

// 0x100 > 0xffffffffffffffff ? false
GT           [Stack : 0x00, 0x00, 0x80, 0xa0, 0x100, 0x00               ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// 0x00 or 0x00 = 0x00
OR           [Stack : 0x00, 0x80, 0xa0, 0x100, 0x00                     ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// not sure why this gets pushed here
PUSH1(0x76)  [Stack : 0x76, 0x00, 0x80, 0xa0, 0x100, 0x00               ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// jump ? false
JUMPI        [Stack : 0x80, 0xa0, 0x100, 0x00                           ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// not sure why this gets pushed here
PUSH1(0x20)  [Stack : 0x20, 0x80, 0xa0, 0x100, 0x00                     ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// exchange 5th and 1st stack elements
SWAP4        [Stack : 0x00, 0x80, 0xa0, 0x100, 0x20                     ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// remove top element 0x00
POP          [Stack : 0x80, 0xa0, 0x100, 0x20                           ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// duplicate 
DUP3         [Stack : 0x100, 0x80, 0xa0, 0x100, 0x20                    ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// prepare to over-write FMPA with previously
// calculated NFMA
PUSH1(0x40)  [Stack : 0x40, 0x100, 0x80, 0xa0, 0x100, 0x20              ]
             [Memory: 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// over-write value of FMPA with next NFMA
// FMPA was not being used though?
MSTORE       [Stack : 0x80, 0xa0, 0x100, 0x20                                         ]
             [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// load `size` parameter for keccak256 call
MLOAD        [Stack : 0x60, 0xa0, 0x100, 0x20                                         ]
             [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// exchange 2nd and 1st stack elements
SWAP1        [Stack : 0xa0, 0x60, 0x100, 0x20                                         ]
             [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]

// call keccak256(offset, size)
KECCAK256    [Stack : result, 0x100, 0x20                                             ]
             [Memory: 0x40 = 0x100, 0x80 = 0x60, 0xa0 = 0x01, 0xc0 = 0x02, 0xe0 = 0x03]
```

### Formal Verification Using Halmos

The following function can be added to the existing `GasDebugTest` contract in order to formally verify using [Halmos](https://github.com/a16z/halmos) that both the assembly and solidity versions produce the same output:

```solidity
// halmos --match-contract GasDebugTest
function check_GasImplEquivalent(uint256 a1, uint256 b1, uint256 c1) external {
    bytes32 resultNormal   = gasImplNormal.getKeccak256(a1,b1,c1);
    bytes32 resultAssembly = gasImplAssembly.getKeccak256(a1,b1,c1);

    assertEq(resultNormal, resultAssembly);
}
```

Execute the formal verification using: `halmos --match-contract GasDebugTest`

### Additional Resources

* [Solady EfficientHashLib.sol](https://github.com/Vectorized/solady/blob/main/src/utils/EfficientHashLib.sol)
