{"version":1,"pages":[{"id":"zO9yG4JUGiXfbUqGO3sZ","title":"README","pathname":"/transformer_evolution_paper","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"hEPnCcp3vleaBRsyDhyD","title":"数学符号","pathname":"/transformer_evolution_paper/notations","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"BtNizbd8EZwx2JdMQkJS","title":"Act","pathname":"/transformer_evolution_paper/act","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"4cbVf08MSdOA5nDvt39I","title":"A survey on recently proposed activation functions for Deep Learning","pathname":"/transformer_evolution_paper/act/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Act"}]},{"id":"nyOaV4CiSUWASU0rkXj4","title":"Arch","pathname":"/transformer_evolution_paper/arch","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"DKAe6tmxNT8smD2fdTxL","title":"Supplementary Material Implementation and Experiments for GAU-based Model","pathname":"/transformer_evolution_paper/arch/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Arch"}]},{"id":"K11j991uIvw4Y4ai1u7u","title":"MetaFormer is Actually What You Need for Vision","pathname":"/transformer_evolution_paper/arch/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Arch"}]},{"id":"8aXXuMBHtxXNxBiTG0Rm","title":"Deeper vs Wider A Revisit of Transformer Configuration","pathname":"/transformer_evolution_paper/arch/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Arch"}]},{"id":"HubU0JGLerwAWnBGXam9","title":"Perceiver General Perception with Iterative Attention","pathname":"/transformer_evolution_paper/arch/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Arch"}]},{"id":"6Jr43O5f720gubP0Tc8B","title":"General-purpose, long-context autoregressive modeling with Perceiver AR","pathname":"/transformer_evolution_paper/arch/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Arch"}]},{"id":"XOilPMfbImq734liJ3Dv","title":"Hierarchical Transformers Are More Efficient Language Models","pathname":"/transformer_evolution_paper/arch/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Arch"}]},{"id":"5DfW75es4g0CuV735x3w","title":"Branchformer: Parallel MLP-Attention Architectures to Capture Local and Global Context for Speech Recognition and Understanding","pathname":"/transformer_evolution_paper/arch/007","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Arch"}]},{"id":"302uphvmVupSaKyn0cSE","title":"Generalization through Memorization: Nearest Neighbor Language Models","pathname":"/transformer_evolution_paper/arch/008","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Arch"}]},{"id":"5IOBjC4W1uXtwsFQJgdR","title":"FFN","pathname":"/transformer_evolution_paper/ffn","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"TUbJpR3GDZXfgjOz3XfW","title":"Large Memory Layers with Product Keys","pathname":"/transformer_evolution_paper/ffn/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"QjS4epL0AFNqgxK2eGtQ","title":"Transformer Feed-Forward Layers Are Key-Value Memories","pathname":"/transformer_evolution_paper/ffn/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"r3axVNwzkfAJpEQey5KL","title":"GLU Variants Improve Transformer","pathname":"/transformer_evolution_paper/ffn/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"A1HYPs7McenXjmuZOzhq","title":"Simple Recurrence Improves Masked Language Models","pathname":"/transformer_evolution_paper/ffn/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"0v3mZOyIY5MkZagNvm85","title":"Pay Attention to MLPs","pathname":"/transformer_evolution_paper/ffn/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"yNbk7VoZhvokRlc9TgIo","title":"S2-MLP Spatial-Shift MLP Architecture for Vision","pathname":"/transformer_evolution_paper/ffn/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"WO0o4KXGm0MVy5zLUwBc","title":"S2-MLPv2 Improved Spatial-Shift MLP Architecture for Vision","pathname":"/transformer_evolution_paper/ffn/007","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"GdAQD9ef4hrrNbOSFSvg","title":"HyperMixer An MLP-based Green AI Alternative to Transformers","pathname":"/transformer_evolution_paper/ffn/008","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"rF4dmBgNuByR7eHSR1rz","title":"DeFINE: DEep Factorized INput Token Embeddings for Neural Sequence Modeling & DeLighT: Deep and Light-weight Transformer","pathname":"/transformer_evolution_paper/ffn/009","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"VSFIaAL8IW4Op7NBO2FK","title":"When Shift Operation Meets Vision Transformer: An Extremely Simple Alternative to Attention Mechanism","pathname":"/transformer_evolution_paper/ffn/010","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"pTwZepb8OmVkU3p8GTXy","title":"Sparse MLP for Image Recognition: Is Self-Attention Really Necessary?","pathname":"/transformer_evolution_paper/ffn/011","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"FFN"}]},{"id":"INAQwCyqHruT9pd2FpQB","title":"Head","pathname":"/transformer_evolution_paper/head","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"eXVRNWkvwQmVjJAdAuqg","title":"Multi-Head Attention Collaborate Instead of Concatenate","pathname":"/transformer_evolution_paper/head/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Head"}]},{"id":"1vDJQ1I7Ez5ADTpfgBZp","title":"Fast Transformer Decoding: One Write-Head is All You Need","pathname":"/transformer_evolution_paper/head/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Head"}]},{"id":"526nfO88TwocJvapsWT4","title":"Memory","pathname":"/transformer_evolution_paper/memory","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"qwVFJ3X74wNa1ZPhjsjA","title":"Compressive Transformers for Long-Range Sequence Modelling","pathname":"/transformer_evolution_paper/memory/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"dck0QhbfwHWiYCj0z58q","title":"Memformer The Memory-Augmented Transformer","pathname":"/transformer_evolution_paper/memory/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"17LsHTpFZifHL132AkIo","title":"Memory Transformer","pathname":"/transformer_evolution_paper/memory/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"Qr25kLyrTG3E6jm7kww6","title":"Do Transformers Need Deep Long-Range Memory","pathname":"/transformer_evolution_paper/memory/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"uOll9zFRn9gIIkI7Gp5R","title":"LaMemo Language Modeling with Look-Ahead Memory","pathname":"/transformer_evolution_paper/memory/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"H6Lc5ebEPq27V4MCMbWt","title":"GMAT Global Memory Augmentation for Transformers","pathname":"/transformer_evolution_paper/memory/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"UYp5zIfXbCk4XmWKkYXj","title":"Block-Recurrent Transformers","pathname":"/transformer_evolution_paper/memory/007","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"lj7bIJ92nSLAqlJ4NWxG","title":"Augmenting Self-attention with Persistent Memory","pathname":"/transformer_evolution_paper/memory/008","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"CRuHN0XKtP1K8gZNnYkb","title":"Recurrent Memory Transformer","pathname":"/transformer_evolution_paper/memory/009","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"csSWFYheRkYNPNUzRWBr","title":"Memorizing Transformers","pathname":"/transformer_evolution_paper/memory/010","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"vtqxFetWmDMzB0Aeu6oZ","title":"Scaling Transformer to 1M tokens and beyond with RMT","pathname":"/transformer_evolution_paper/memory/011","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"nv2cFpzkq1zGIxon2FXY","title":"Adapting Language Models to Compress Contexts","pathname":"/transformer_evolution_paper/memory/012","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Memory"}]},{"id":"s0TmGyXECQUHvEFiaNqC","title":"MHA","pathname":"/transformer_evolution_paper/mha","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"uIgqhhbtADeOZebD9c9E","title":"FFT","pathname":"/transformer_evolution_paper/mha/fft","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"}]},{"id":"UVdNQKTSzU80w2yx7uWL","title":"Fourier Neural Operator for Parametric Partial Differential Equations","pathname":"/transformer_evolution_paper/mha/fft/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"FFT"}]},{"id":"ra9t2lKvzzcRNnkEhPbH","title":"Global Filter Networks for Image Classification","pathname":"/transformer_evolution_paper/mha/fft/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"FFT"}]},{"id":"MlD7LAu0SOsbfWD3t0sG","title":"Adaptive Fourier Neural Operators: Efficient Token Mixers for Transformers","pathname":"/transformer_evolution_paper/mha/fft/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"FFT"}]},{"id":"SJWFdrzXcBFRM8vnhlNp","title":"FNet: Mixing Tokens with Fourier Transforms","pathname":"/transformer_evolution_paper/mha/fft/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"FFT"}]},{"id":"fRxx7xRDUYGiqmcj67hb","title":"LocalGlobal","pathname":"/transformer_evolution_paper/mha/localglobal","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"}]},{"id":"peCoqPTrZHiqGP9xRATB","title":"CrossFormer: A Versatile Vision Transformer Hinging on Cross-scale Attention","pathname":"/transformer_evolution_paper/mha/localglobal/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"LocalGlobal"}]},{"id":"yOKK8k9vObUiPbsQhDXc","title":"Nested Hierarchical Transformer: Towards Accurate, Data-Efficient and Interpretable Visual Understanding","pathname":"/transformer_evolution_paper/mha/localglobal/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"LocalGlobal"}]},{"id":"LGBGk5k7GhipMtkyoVym","title":"Neighborhood Attention Transformer","pathname":"/transformer_evolution_paper/mha/localglobal/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"LocalGlobal"}]},{"id":"zY2CRI8vxdZsM1HvbhZ0","title":"FMMformer: Efficient and Flexible Transformer via Decomposed Near-field and Far-field Attention","pathname":"/transformer_evolution_paper/mha/localglobal/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"LocalGlobal"}]},{"id":"ZLsWhvysLpiH7bumtJL0","title":"Adaptive Attention Span in Transformers","pathname":"/transformer_evolution_paper/mha/localglobal/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"LocalGlobal"}]},{"id":"6CmJy236rD68qgPiN6Va","title":"CoLT5: Faster Long-Range Transformers with Conditional Computation","pathname":"/transformer_evolution_paper/mha/localglobal/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"LocalGlobal"}]},{"id":"99XBpXOTvUzHg9PM8jp3","title":"MatrixMethod","pathname":"/transformer_evolution_paper/mha/matrixmethod","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"}]},{"id":"5oJn39zJ3UNWhzXraFiS","title":"Skyformer Remodel Self-Attention with Gaussian Kernel and Nyström Method","pathname":"/transformer_evolution_paper/mha/matrixmethod/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"MatrixMethod"}]},{"id":"n1qYyvkyyoqp4bMLwKso","title":"Is Attention Better Than Matrix Decomposition","pathname":"/transformer_evolution_paper/mha/matrixmethod/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"MatrixMethod"}]},{"id":"i7hjfbheIlA4w7FJRLCH","title":"RightProduct","pathname":"/transformer_evolution_paper/mha/rightproduct","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"}]},{"id":"MOXE7VHdfi7PgUgctiQN","title":"Kronecker Attention Networks","pathname":"/transformer_evolution_paper/mha/rightproduct/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"YEnmnWuIV4YfOedJrQZ1","title":"An Attention Free Transformer","pathname":"/transformer_evolution_paper/mha/rightproduct/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"HwGPOU9aEi3UpczooIh4","title":"Transformer with Fourier Integral Attentions","pathname":"/transformer_evolution_paper/mha/rightproduct/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"Tjng9C8Qbch9OOb8Lz7o","title":"Linear Complexity Randomized Self-attention Mechanism","pathname":"/transformer_evolution_paper/mha/rightproduct/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"WnYtrKEqvwFR9F0lEN1i","title":"UFO-ViT: High Performance Linear Vision Transformer without Softmax","pathname":"/transformer_evolution_paper/mha/rightproduct/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"lXjDxGiIqwkTmK997VuT","title":"XCiT: Cross-Covariance Image Transformers","pathname":"/transformer_evolution_paper/mha/rightproduct/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"VBtuyjjCzMlEqn2MTrBV","title":"SimpleTRON: Simple Transformer with O(N) Complexity","pathname":"/transformer_evolution_paper/mha/rightproduct/007","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"jgm1hMmVg87c3j33HESF","title":"A Dot Product Attention Free Transformer","pathname":"/transformer_evolution_paper/mha/rightproduct/008","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"1I5TWjeiDBonIohBOvHm","title":"On Learning the Transformer Kernel","pathname":"/transformer_evolution_paper/mha/rightproduct/009","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"iLKd2ieV3BxlyKOyxqlz","title":"Momentum Transformer: Closing the Performance Gap Between Self-attention and Its Linearization","pathname":"/transformer_evolution_paper/mha/rightproduct/010","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"RightProduct"}]},{"id":"sPZsFRyVynZgVGI90Hbp","title":"SparseOrLowRank","pathname":"/transformer_evolution_paper/mha/sparseorlowrank","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"}]},{"id":"WjCclSWPFgR8XaBIombS","title":"Explicit Sparse Transformer: Concentrated Attention Through Explicit Selection","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"vqxrTbkJhDmeZxqmsCuH","title":"Scatterbrain: Unifying Sparse and Low-rank Attention Approximation","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"cPVPtN2GmJRyTDZdIIQG","title":"Sparse Factorization of Large Square Matrices","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"EYsMldR4qcsrbKaAn6wE","title":"Blockwise Self-Attention for Long Document Understanding","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"Lu9RAe7Ps1J3QOqiXoeJ","title":"H-Transformer-1D: Fast One-Dimensional Hierarchical Attention for Sequences","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"SGwvhuHdYNX7ukzmrFDS","title":"ChunkFormer: Learning Long Time Series with Multi-stage Chunked Transformer","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"rU5ihqq7B7OeHz1X5QYK","title":"Enhancing the Locality and Breaking the Memory Bottleneck of Transformer on Time Series Forecasting","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/007","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"AQs2orizQ8p67GKA7o36","title":"Fast Transformers with Clustered Attention","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/008","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"F7MJex3tddIxq1b9YhuF","title":"Long-Short Transformer: Efficient Transformers for Language and Vision","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/009","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"cadqiYfBAJaXcRmmw8xp","title":"LongT5: Efficient Text-To-Text Transformer for Long Sequences","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/010","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"M2MV8opBYPyiECFk1CS1","title":"Luna: Linear Unified Nested Attention","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/011","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"hCmetP4CUDKds7XPM5IA","title":"Memory-efficient Transformers via Top-k Attention","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/012","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"AAdCNZgkvkGk2xFqx9As","title":"Separable Self-attention for Mobile Vision Transformers","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/013","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"X3lCd8MXZYfR7xrPoWnr","title":"Simple Local Attentions Remain Competitive for Long-Context Tasks","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/014","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"kbzp1d4iHXOz4EkZDUZh","title":"You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling","pathname":"/transformer_evolution_paper/mha/sparseorlowrank/015","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"SparseOrLowRank"}]},{"id":"J8rWdSuaO1zy5vSurQBq","title":"Others","pathname":"/transformer_evolution_paper/mha/others","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"}]},{"id":"Xg1cUgGjDgHHOAud8Q40","title":"Synthesizer: Rethinking Self-Attention in Transformer Models","pathname":"/transformer_evolution_paper/mha/others/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"OoMlDCXUmQLTZxVOVmH5","title":"Transformer Dissection: A Unified Understanding of Transformer's Attention via the Lens of Kern","pathname":"/transformer_evolution_paper/mha/others/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"Iyyk7HDXBTSUResm2VFt","title":"Combiner Full Attention Transformer with Sparse Computation Cost","pathname":"/transformer_evolution_paper/mha/others/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"WbqRx5ZCiJYLrjlMQHIT","title":"Ripple Attention for Visual Perception with Sub-quadratic Complexity","pathname":"/transformer_evolution_paper/mha/others/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"4xt9OqcwNobM6zZHhP9C","title":"Sinkformers: Transformers with Doubly Stochastic Attention","pathname":"/transformer_evolution_paper/mha/others/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"C7knc4wzqFjeWMQmN0G1","title":"SOFT: Softmax-free Transformer with Linear Complexity","pathname":"/transformer_evolution_paper/mha/others/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"biFeqKLmDoWaYWbPwKR6","title":"Value-aware Approximate Attention","pathname":"/transformer_evolution_paper/mha/others/007","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"Q9DpzL1WepgeRFtgy5Rg","title":"EL-Attention: Memory Efficient Lossless Attention for Generation","pathname":"/transformer_evolution_paper/mha/others/008","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"LbgH2PdPXDL3foZpp8s8","title":"Flowformer: Linearizing Transformers with Conservation Flows","pathname":"/transformer_evolution_paper/mha/others/009","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"RbhH2SSGiWLWgp6SWahT","title":"ETSformer: Exponential Smoothing Transformers for Time-series Forecasting","pathname":"/transformer_evolution_paper/mha/others/010","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"E9zHlP0E2AQpzbInFO82","title":"IGLOO: Slicing the Features Space to Represent Sequences","pathname":"/transformer_evolution_paper/mha/others/011","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"jUYCPI9GEwaobRlXOd3R","title":"Swin Transformer V2: Scaling Up Capacity and Resolution","pathname":"/transformer_evolution_paper/mha/others/012","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"F8iiZB0YyPYHDSzMrOMJ","title":"Skip-Attention: Improving Vision Transformers by Paying Less Attention","pathname":"/transformer_evolution_paper/mha/others/013","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"MHA"},{"label":"Others"}]},{"id":"jlLdIM06YPsMsSlDrHZ8","title":"Normalize_And_Residual","pathname":"/transformer_evolution_paper/normalize_and_residual","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"Pl6XlGGtrJGWtxuRb15x","title":"ReZero is All You Need Fast Convergence at Large Depth","pathname":"/transformer_evolution_paper/normalize_and_residual/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Normalize_And_Residual"}]},{"id":"tfhhjLnuPXZQvwoaMC0e","title":"Batch Normalization Biases Residual Blocks Towards the Identity Function in Deep Networks","pathname":"/transformer_evolution_paper/normalize_and_residual/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Normalize_And_Residual"}]},{"id":"Vz5OWsMWvWboExDs0YsM","title":"Improving Deep Transformer with Depth-Scaled Initialization and Merged Attention","pathname":"/transformer_evolution_paper/normalize_and_residual/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Normalize_And_Residual"}]},{"id":"NlMwylTKyG07Rk3nymPf","title":"RealFormer Transformer Likes Residual Attention","pathname":"/transformer_evolution_paper/normalize_and_residual/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Normalize_And_Residual"}]},{"id":"pSxTAKXTdkQEzNBFp58U","title":"On Layer Normalizations and Residual Connections in Transformers","pathname":"/transformer_evolution_paper/normalize_and_residual/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Normalize_And_Residual"}]},{"id":"HspPV42WcD21q7GiqxuH","title":"Transformers without Tears: Improving the Normalization of Self-Attention","pathname":"/transformer_evolution_paper/normalize_and_residual/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Normalize_And_Residual"}]},{"id":"168mGmU9LQyd3sbGUtvR","title":"Query-Key Normalization for Transformers","pathname":"/transformer_evolution_paper/normalize_and_residual/007","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Normalize_And_Residual"}]},{"id":"VV6H0eTU0svu4dat8s8k","title":"Understanding the difficulty of training transformers","pathname":"/transformer_evolution_paper/normalize_and_residual/008","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Normalize_And_Residual"}]},{"id":"YU2djMbUUpiWxN167aSX","title":"Pe","pathname":"/transformer_evolution_paper/pe","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"MBqAXx7tCPKb1sIwwtKX","title":"A Simple and Effective Positional Encoding for Transformers","pathname":"/transformer_evolution_paper/pe/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"FGztvuEpfX30NhtdvXLC","title":"DeBERTa Decoding-enhanced BERT with Disentangled Attention","pathname":"/transformer_evolution_paper/pe/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"Q8jfGMcMkvH6NDRh4ow2","title":"DecBERT Enhancing the Language Understanding of BERT with Causal Attention Masks","pathname":"/transformer_evolution_paper/pe/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"HZStAKTeWw8JcO7yAN5r","title":"Encoding word order in complex embeddings","pathname":"/transformer_evolution_paper/pe/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"tzSyiLKji0S4CTfITRcY","title":"Improve Transformer Models with Better Relative Position Embeddings","pathname":"/transformer_evolution_paper/pe/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"4TrYM2l4Xjzp0ucfweCg","title":"KERPLE Kernelized Relative Positional Embedding for Length Extrapolation","pathname":"/transformer_evolution_paper/pe/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"VtWn5jKVdMB25m4x9uwl","title":"PermuteFormer Efficient Relative Position Encoding for Long Sequences","pathname":"/transformer_evolution_paper/pe/007","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"uKxakLx7SYI0cXbixHDz","title":"Rethinking Positional Encoding in Language Pre-training","pathname":"/transformer_evolution_paper/pe/008","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"aBlpRBREzErYoRAJKUQQ","title":"Transformer-XL Attentive Language Models Beyond a Fixed-Length Context","pathname":"/transformer_evolution_paper/pe/009","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"pwdoWgBKBTb733o1myTy","title":"Translational Equivariance in Kernelizable Attention","pathname":"/transformer_evolution_paper/pe/010","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"IU295cYJAnUsKtsSx2Au","title":"Transformer Language Models without Positional Encodings Still Learn Positional Information","pathname":"/transformer_evolution_paper/pe/011","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"EwRm4clvBLJCwsUnDdYq","title":"Stable, Fast and Accurate: Kernelized Attention with Relative Positional Encoding","pathname":"/transformer_evolution_paper/pe/012","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"cL1p2VmnNAdPFamzRwbM","title":"Randomized Positional Encodings Boost Length Generalization of Transformers","pathname":"/transformer_evolution_paper/pe/013","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pe"}]},{"id":"609kOiHfcVp031kCdgHf","title":"Pretrain","pathname":"/transformer_evolution_paper/pretrain","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"Nh5jtHUwVaEifkAK6jta","title":"XLNet Generalized Autoregressive Pretraining for Language Understanding","pathname":"/transformer_evolution_paper/pretrain/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pretrain"}]},{"id":"9EGFgvKb9rtyS4bWnn7F","title":"Transcormer Transformer for Sentence Scoring with Sliding Language Modeling","pathname":"/transformer_evolution_paper/pretrain/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pretrain"}]},{"id":"t6AfFOQcdKw6JEKA50qj","title":"Optimus Organizing Sentences via Pre-trained Modeling of a Latent Space","pathname":"/transformer_evolution_paper/pretrain/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pretrain"}]},{"id":"oeqVCnIArdU0oODdeteu","title":"ELECTRA Pre-training Text Encoders as Discriminators Rather Than Generators","pathname":"/transformer_evolution_paper/pretrain/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pretrain"}]},{"id":"NT2S7IN7KvBv0qXYyuQG","title":"Cramming: Training a Language Model on a Single GPU in One Day","pathname":"/transformer_evolution_paper/pretrain/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Pretrain"}]},{"id":"rTu9NnRBWgYKYxt5ESIf","title":"Softmax","pathname":"/transformer_evolution_paper/softmax","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"NqbsYT21k6MHNtgL59j1","title":"Transformer with a Mixture of Gaussian Keys","pathname":"/transformer_evolution_paper/softmax/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Softmax"}]},{"id":"RX0TSixHDPglCHE8P3m1","title":"Normalized Attention Without Probability Cage","pathname":"/transformer_evolution_paper/softmax/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Softmax"}]},{"id":"rfi3oOJqtVub747KdmhJ","title":"Others","pathname":"/transformer_evolution_paper/others","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"rndJb6iDPwJZMWhl53oW","title":"Accelerating Neural Transformer via an Average Attention Network","pathname":"/transformer_evolution_paper/others/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Others"}]},{"id":"V0nbBew6SADsh9IWfRHK","title":"Do Transformer Modifications Transfer Across Implementations and Applications?","pathname":"/transformer_evolution_paper/others/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Others"}]},{"id":"PhUIwyhaP8wOriGBAQqW","title":"Object-Centric Learning with Slot Attention","pathname":"/transformer_evolution_paper/others/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Others"}]},{"id":"HSNnZy0jGgzhozvUqafy","title":"Do Transformer Modifications Transfer Across Implementations and Applications?","pathname":"/transformer_evolution_paper/others/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Others"}]},{"id":"1XrlQPMcMmg9JlIDdcRr","title":"Why self-attention is Natural for Sequence-to-Sequence Problems? A Perspective from Symmetries","pathname":"/transformer_evolution_paper/others/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Others"}]},{"id":"Wkgx1KPa5z5Y6OZssMId","title":"LongConv","pathname":"/transformer_evolution_paper/longconv","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"9NL8ioOPnPbMhHUYX3gu","title":"Legendre Memory Units: Continuous-Time Representation in Recurrent Neural Networks","pathname":"/transformer_evolution_paper/longconv/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"JcjMP87oW3I8Ch4HWHo6","title":"Parallelizing Legendre Memory Unit Training","pathname":"/transformer_evolution_paper/longconv/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"Jata87xg6XLDq4a3WYXX","title":"Simplified State Space Layers for Sequence Modeling","pathname":"/transformer_evolution_paper/longconv/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"8oY4SOy7Rd8iPWzGeCiL","title":"Pretraining Without Attention","pathname":"/transformer_evolution_paper/longconv/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"dLXgQSI6jYLzRSbuG9Oj","title":"What Makes Convolutional Models Great on Long Sequence Modeling?","pathname":"/transformer_evolution_paper/longconv/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"M1oPRs5bSR8IKyylLQ64","title":"Hungry Hungry Hippos: Towards Language Modeling with State Space Models","pathname":"/transformer_evolution_paper/longconv/006","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"ltdKp26CRb3Uzr881CYA","title":"Hyena Hierarchy: Towards Larger Convolutional Language Models","pathname":"/transformer_evolution_paper/longconv/007","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"DybShMONVTFul3j9l1wX","title":"RWKV","pathname":"/transformer_evolution_paper/longconv/008","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"c4bUKL1E3xqW7TSPrJLd","title":"Simple Hardware-Efficient Long Convolutions for Sequence Modeling","pathname":"/transformer_evolution_paper/longconv/009","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"YECc8IHI7ihDjKjqhJic","title":"Time-aware large kernel convolutions","pathname":"/transformer_evolution_paper/longconv/010","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"0eRI1sfxdtboqhPwKHbr","title":"Resurrecting Recurrent Neural Networks for Long Sequences","pathname":"/transformer_evolution_paper/longconv/011","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"llFmfRujyViWGjZIzi1J","title":"CKConv: Continuous Kernel Convolution For Sequential Data","pathname":"/transformer_evolution_paper/longconv/012","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"PqsEode3Dh2gBstXo35U","title":"FlexConv: Continuous Kernel Convolutions with Differentiable Kernel Sizes","pathname":"/transformer_evolution_paper/longconv/013","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"MSpqSMtIigPux570Bolz","title":"Towards a General Purpose CNN for Long Range Dependencies in ND","pathname":"/transformer_evolution_paper/longconv/014","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LongConv"}]},{"id":"chtW1uLOy1cz0bQnLPHT","title":"Rnn","pathname":"/transformer_evolution_paper/rnn","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"FO55zG9AsElM0NYCUDIH","title":"When Attention Meets Fast Recurrence: Training Language Models with Reduced Compute","pathname":"/transformer_evolution_paper/rnn/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Rnn"}]},{"id":"jLe5euI5iZLYwt4MG6Vo","title":"Linear Transformers Are Secretly Fast Weight Programmers","pathname":"/transformer_evolution_paper/rnn/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Rnn"}]},{"id":"gMxxWAEMa6yWGnQYtgr4","title":"Going Beyond Linear Transformers with Recurrent Fast Weight Programmers","pathname":"/transformer_evolution_paper/rnn/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Rnn"}]},{"id":"Tu05Djm2NiHbCNfKxmeZ","title":"Parallelizing Linear Recurrent Neural Nets Over Sequence Length","pathname":"/transformer_evolution_paper/rnn/004","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Rnn"}]},{"id":"4EXEGGul8lckTR48NsHm","title":"Quasi-recurrent neural networks","pathname":"/transformer_evolution_paper/rnn/005","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Rnn"}]},{"id":"dYaMrl0mHLkpCNZ9EyFM","title":"CrossAttention","pathname":"/transformer_evolution_paper/crossattention","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"JKXZPJy7gLNNjpSTcnsu","title":"Neural Machine Translation in Linear Time","pathname":"/transformer_evolution_paper/crossattention/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"CrossAttention"}]},{"id":"fnUs7zinrtJV0seUFLPP","title":"Inference","pathname":"/transformer_evolution_paper/inference","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"RWfoxyVbXBOcsDtcI8QA","title":"Extrapolation","pathname":"/transformer_evolution_paper/inference/extrapolation","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Inference"}]},{"id":"4EkGQ3d44BZhz4ebiNkq","title":"Parallel Context Windows for Large Language Models","pathname":"/transformer_evolution_paper/inference/extrapolation/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Inference"},{"label":"Extrapolation"}]},{"id":"BbR7KplOFu3TlSkBChDI","title":"Structured Prompting: Scaling In-Context Learning to 1,000 Examples","pathname":"/transformer_evolution_paper/inference/extrapolation/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Inference"},{"label":"Extrapolation"}]},{"id":"K23QmkynvjV7naOL1wHh","title":"Naive Bayes-based Context Extension","pathname":"/transformer_evolution_paper/inference/extrapolation/003","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Inference"},{"label":"Extrapolation"}]},{"id":"C7fbVyLAt2vknqAYPKqW","title":"Peft","pathname":"/transformer_evolution_paper/peft","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"ZCjJSJT0mYd18Ajdhs7M","title":"Parameter-Efficient Fine-Tuning without Introducing New Latency","pathname":"/transformer_evolution_paper/peft/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Peft"}]},{"id":"pmN7vXPgXOYBuU97z0Co","title":"Make Your Pre-trained Model Reversible: From Parameter to Memory Efficient Fine-Tuning","pathname":"/transformer_evolution_paper/peft/002","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"Peft"}]},{"id":"SqJzgHSSVtRezq6qtnIU","title":"LLM","pathname":"/transformer_evolution_paper/llm","siteSpaceId":"sitesp_ThjXY","description":""},{"id":"89noTXy3oFOceiQksUcU","title":"LLM Details Summary","pathname":"/transformer_evolution_paper/llm/000","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LLM"}]},{"id":"tFXrNgrmPU52TCcukG24","title":"What Language Model to Train if You Have One Million GPU Hours?","pathname":"/transformer_evolution_paper/llm/001","siteSpaceId":"sitesp_ThjXY","description":"","breadcrumbs":[{"label":"LLM"}]}]}