import { DailyPapersApiResponse } from "./types.mts"; export const dailyPapersSample: DailyPapersApiResponse = [ { "paper":{ "id":"2401.04081", "authors":[ { "_id":"659ccb1818ad5521982c6032", "user":{ "avatarUrl":"/avatars/71291f75e2ab20818537159179f904a6.svg", "isPro":false, "fullname":"Maciej Pióro", "user":"maciek-pioro", "type":"user" }, "name":"Maciej Pióro", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:15:29.785Z", "hidden":false }, { "_id":"659ccb1818ad5521982c6033", "name":"Kamil Ciebiera", "hidden":false }, { "_id":"659ccb1818ad5521982c6034", "name":"Krystian Król", "hidden":false }, { "_id":"659ccb1818ad5521982c6035", "name":"Jan Ludziejewski", "hidden":false }, { "_id":"659ccb1818ad5521982c6036", "name":"Sebastian Jaszczur", "hidden":false } ], "publishedAt":"2024-01-08T18:35:07.000Z", "title":"MoE-Mamba: Efficient Selective State Space Models with Mixture of\n Experts", "summary":"State Space Models (SSMs) have become serious contenders in the field of\nsequential modeling, challenging the dominance of Transformers. At the same\ntime, Mixture of Experts (MoE) has significantly improved Transformer-based\nLLMs, including recent state-of-the-art open-source models. We propose that to\nunlock the potential of SSMs for scaling, they should be combined with MoE. We\nshowcase this on Mamba, a recent SSM-based model that achieves remarkable,\nTransformer-like performance. Our model, MoE-Mamba, outperforms both Mamba and\nTransformer-MoE. In particular, MoE-Mamba reaches the same performance as Mamba\nin 2.2x less training steps while preserving the inference performance gains of\nMamba against the Transformer.", "upvotes":57 }, "publishedAt":"2024-01-09T04:27:05.325Z", "title":"MoE-Mamba: Efficient Selective State Space Models with Mixture of Experts", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/arhkhspbmHKeuYIc2mmKh.png", "numComments":4 }, { "paper":{ "id":"2401.05561", "authors":[ { "_id":"65a09d6ce969415381f1a31d", "name":"Lichao Sun", "hidden":false }, { "_id":"65a09d6ce969415381f1a31e", "name":"Yue Huang", "hidden":false }, { "_id":"65a09d6ce969415381f1a31f", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/64b82c659ebb69a79f0073f6/INEHG7kijEHOhFZMjdBIM.png?w=200&h=200&f=face", "isPro":false, "fullname":"Haoran Wang", "user":"hwang219", "type":"user" }, "name":"Haoran Wang", "status":"claimed_verified", "statusLastChangedAt":"2024-01-15T07:42:05.891Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a320", "name":"Siyuan Wu", "hidden":false }, { "_id":"65a09d6ce969415381f1a321", "user":{ "avatarUrl":"/avatars/4290bc4a6d1a1440b76d9e34ceeea4e3.svg", "isPro":false, "fullname":"Qihui Zhang", "user":"7Hui", "type":"user" }, "name":"Qihui Zhang", "status":"claimed_verified", "statusLastChangedAt":"2024-01-12T13:42:12.968Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a322", "name":"Chujie Gao", "hidden":false }, { "_id":"65a09d6ce969415381f1a323", "name":"Yixin Huang", "hidden":false }, { "_id":"65a09d6ce969415381f1a324", "name":"Wenhan Lyu", "hidden":false }, { "_id":"65a09d6ce969415381f1a325", "name":"Yixuan Zhang", "hidden":false }, { "_id":"65a09d6ce969415381f1a326", "user":{ "avatarUrl":"/avatars/03fb549171d3b103914aa64103e5739e.svg", "isPro":false, "fullname":"Xiner Li", "user":"hyanan16", "type":"user" }, "name":"Xiner Li", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:49:50.423Z", "hidden":false }, { 
"_id":"65a09d6ce969415381f1a327", "name":"Zhengliang Liu", "hidden":false }, { "_id":"65a09d6ce969415381f1a328", "user":{ "avatarUrl":"/avatars/36558928bd04be7f49837d4c603681d7.svg", "isPro":false, "fullname":"Yixin Liu", "user":"henryL7", "type":"user" }, "name":"Yixin Liu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:50:25.731Z", "hidden":true }, { "_id":"65a09d6ce969415381f1a329", "name":"Yijue Wang", "hidden":false }, { "_id":"65a09d6ce969415381f1a32a", "name":"Zhikun Zhang", "hidden":false }, { "_id":"65a09d6ce969415381f1a32b", "name":"Bhavya Kailkhura", "hidden":false }, { "_id":"65a09d6ce969415381f1a32c", "user":{ "avatarUrl":"/avatars/c87c273ca628dbcddccbf1ee19b2ce33.svg", "isPro":false, "fullname":"Caiming Xiong", "user":"cxiong", "type":"user" }, "name":"Caiming Xiong", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:51:13.398Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a32d", "name":"Chao Zhang", "hidden":false }, { "_id":"65a09d6ce969415381f1a32e", "name":"Chaowei Xiao", "hidden":false }, { "_id":"65a09d6ce969415381f1a32f", "user":{ "avatarUrl":"/avatars/430560ec2c2547f819225769ab432f30.svg", "isPro":false, "fullname":"Chunyuan Li", "user":"Chunyuan24", "type":"user" }, "name":"Chunyuan Li", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:51:35.436Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a330", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/noauth/sLIrNelAWPVOy4e3oo5LB.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Eric Xing", "user":"EricX003", "type":"user" }, "name":"Eric Xing", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:51:42.959Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a331", "user":{ "avatarUrl":"/avatars/836e61be4aeda2080ddf2db9f2626cc6.svg", "isPro":false, "fullname":"Furong Huang Lab at UMD", "user":"furongh-lab", "type":"user" }, "name":"Furong Huang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:51:53.877Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a332", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/62aa53fc5df2251f7df798c6/4wzcKNPjlZ2iKkvgZA-t5.png?w=200&h=200&f=face", "isPro":false, "fullname":"Hao Liu", "user":"haoliu", "type":"user" }, "name":"Hao Liu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:52:12.162Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a333", "name":"Heng Ji", "hidden":false }, { "_id":"65a09d6ce969415381f1a334", "user":{ "avatarUrl":"/avatars/6e45e9dbaa5b5410b94dd1e17eabd2f3.svg", "isPro":false, "fullname":"Hongyi Wang", "user":"hwang595", "type":"user" }, "name":"Hongyi Wang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:52:53.969Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a335", "name":"Huan Zhang", "hidden":false }, { "_id":"65a09d6ce969415381f1a336", "name":"Huaxiu Yao", "hidden":false }, { "_id":"65a09d6ce969415381f1a337", "name":"Manolis Kellis", "hidden":false }, { "_id":"65a09d6ce969415381f1a338", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1667770136112-636826f95bb06007ea0e911e.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Marinka Zitnik", "user":"marinkaz", "type":"user" }, "name":"Marinka Zitnik", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:54:07.144Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a339", "user":{ 
"avatarUrl":"/avatars/42c182189f09f228bf4f8eabb0168bfc.svg", "isPro":false, "fullname":"Meng Jiang", "user":"mjiang89", "type":"user" }, "name":"Meng Jiang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:54:57.296Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a33a", "name":"Mohit Bansal", "hidden":false }, { "_id":"65a09d6ce969415381f1a33b", "user":{ "avatarUrl":"/avatars/7647f99abdcca4251fcac7783b6fcc8d.svg", "isPro":false, "fullname":"zou", "user":"jameszou707", "type":"user" }, "name":"James Zou", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:55:28.539Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a33c", "user":{ "avatarUrl":"/avatars/24644537b352a4e43cc9fd05f954a43c.svg", "isPro":false, "fullname":"jianpei.wjp", "user":"jianpei", "type":"user" }, "name":"Jian Pei", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:55:37.180Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a33d", "name":"Jian Liu", "hidden":false }, { "_id":"65a09d6ce969415381f1a33e", "user":{ "avatarUrl":"/avatars/4a63eac71eb30f70b1a0e9d4708f26c1.svg", "isPro":false, "fullname":"Jianfeng Gao", "user":"wyngjf", "type":"user" }, "name":"Jianfeng Gao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:55:49.797Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a33f", "name":"Jiawei Han", "hidden":false }, { "_id":"65a09d6ce969415381f1a340", "user":{ "avatarUrl":"/avatars/1a9e6eb47c14fe278b8e1d907d518f46.svg", "isPro":false, "fullname":"Jieyu Zhao", "user":"jieyuz", "type":"user" }, "name":"Jieyu Zhao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:56:16.185Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a341", "name":"Jiliang Tang", "hidden":false }, { "_id":"65a09d6ce969415381f1a342", "user":{ "avatarUrl":"/avatars/18daf2de5671e711dc745388dd60569d.svg", "isPro":false, "fullname":"Jindong Wang", "user":"jindongwang", "type":"user" }, "name":"Jindong Wang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:56:36.627Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a343", "name":"John Mitchell", "hidden":false }, { "_id":"65a09d6ce969415381f1a344", "name":"Kai Shu", "hidden":false }, { "_id":"65a09d6ce969415381f1a345", "user":{ "avatarUrl":"/avatars/5240fa09876b90df763aee0e5a1c24e8.svg", "isPro":false, "fullname":"Kaidi Xu", "user":"kaidiXu", "type":"user" }, "name":"Kaidi Xu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:57:22.933Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a346", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1622653364258-noauth.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Kai-Wei Chang", "user":"kaiweichang", "type":"user" }, "name":"Kai-Wei Chang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:57:31.794Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a347", "name":"Lifang He", "hidden":false }, { "_id":"65a09d6ce969415381f1a348", "name":"Lifu Huang", "hidden":false }, { "_id":"65a09d6ce969415381f1a349", "user":{ "avatarUrl":"/avatars/6f187dd88e88caf1fe31127c5d0827c1.svg", "isPro":false, "fullname":"Heinrich", "user":"backm007", "type":"user" }, "name":"Michael Backes", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:58:01.469Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a34a", "name":"Neil Zhenqiang Gong", "hidden":false }, { "_id":"65a09d6ce969415381f1a34b", "name":"Philip S. 
Yu", "hidden":false }, { "_id":"65a09d6ce969415381f1a34c", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/6495dd0b71f6708e0f990032/PBIjdKNnpkxvR_3djCGVm.png?w=200&h=200&f=face", "isPro":false, "fullname":"Pin-Yu Chen", "user":"pinyuchen", "type":"user" }, "name":"Pin-Yu Chen", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:58:27.459Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a34d", "user":{ "avatarUrl":"/avatars/06cc76feebba0cc80ebb8f4ff86f6d9b.svg", "isPro":false, "fullname":"Quanquan Gu", "user":"thughost", "type":"user" }, "name":"Quanquan Gu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:58:35.118Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a34e", "name":"Ran Xu", "hidden":false }, { "_id":"65a09d6ce969415381f1a34f", "name":"Rex Ying", "hidden":false }, { "_id":"65a09d6ce969415381f1a350", "name":"Shuiwang Ji", "hidden":false }, { "_id":"65a09d6ce969415381f1a351", "user":{ "avatarUrl":"/avatars/e393586c34ad7e35091db8e13a8f2166.svg", "isPro":false, "fullname":"Suman jana", "user":"Suman116", "type":"user" }, "name":"Suman Jana", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:59:11.212Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a352", "name":"Tianlong Chen", "hidden":false }, { "_id":"65a09d6ce969415381f1a353", "name":"Tianming Liu", "hidden":false }, { "_id":"65a09d6ce969415381f1a354", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/noauth/VJ4cDyjp5M3V5WmI5gPIU.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Tianyi Zhou", "user":"zhoutianyi", "type":"user" }, "name":"Tianyi Zhou", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:59:54.064Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a355", "name":"Willian Wang", "hidden":true }, { "_id":"65a09d6ce969415381f1a356", "name":"Xiang Li", "hidden":false }, { "_id":"65a09d6ce969415381f1a357", "user":{ "avatarUrl":"/avatars/7a48a2dac4e6ebb9e775022e15ddc5a7.svg", "isPro":false, "fullname":"zhangxiangliang", "user":"ZhangXiangliang", "type":"user" }, "name":"Xiangliang Zhang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T11:00:18.435Z", "hidden":false }, { "_id":"65a09d6ce969415381f1a358", "name":"Xiao Wang", "hidden":false }, { "_id":"65a09d6ce969415381f1a359", "name":"Xing Xie", "hidden":false }, { "_id":"65a09d6ce969415381f1a35a", "name":"Xun Chen", "hidden":false }, { "_id":"65a09d6ce969415381f1a35b", "name":"Xuyu Wang", "hidden":false }, { "_id":"65a09d6ce969415381f1a35c", "name":"Yan Liu", "hidden":false }, { "_id":"65a09d6ce969415381f1a35d", "name":"Yanfang Ye", "hidden":false }, { "_id":"65a09d6ce969415381f1a35e", "name":"Yinzhi Cao", "hidden":false }, { "_id":"65a09d6ce969415381f1a35f", "name":"Yue Zhao", "hidden":false } ], "publishedAt":"2024-01-10T22:07:21.000Z", "title":"TrustLLM: Trustworthiness in Large Language Models", "summary":"Large language models (LLMs), exemplified by ChatGPT, have gained\nconsiderable attention for their excellent natural language processing\ncapabilities. Nonetheless, these LLMs present many challenges, particularly in\nthe realm of trustworthiness. Therefore, ensuring the trustworthiness of LLMs\nemerges as an important topic. 
This paper introduces TrustLLM, a comprehensive\nstudy of trustworthiness in LLMs, including principles for different dimensions\nof trustworthiness, established benchmark, evaluation, and analysis of\ntrustworthiness for mainstream LLMs, and discussion of open challenges and\nfuture directions. Specifically, we first propose a set of principles for\ntrustworthy LLMs that span eight different dimensions. Based on these\nprinciples, we further establish a benchmark across six dimensions including\ntruthfulness, safety, fairness, robustness, privacy, and machine ethics. We\nthen present a study evaluating 16 mainstream LLMs in TrustLLM, consisting of\nover 30 datasets. Our findings firstly show that in general trustworthiness and\nutility (i.e., functional effectiveness) are positively related. Secondly, our\nobservations reveal that proprietary LLMs generally outperform most open-source\ncounterparts in terms of trustworthiness, raising concerns about the potential\nrisks of widely accessible open-source LLMs. However, a few open-source LLMs\ncome very close to proprietary ones. Thirdly, it is important to note that some\nLLMs may be overly calibrated towards exhibiting trustworthiness, to the extent\nthat they compromise their utility by mistakenly treating benign prompts as\nharmful and consequently not responding. Finally, we emphasize the importance\nof ensuring transparency not only in the models themselves but also in the\ntechnologies that underpin trustworthiness. Knowing the specific trustworthy\ntechnologies that have been employed is crucial for analyzing their\neffectiveness.", "upvotes":50 }, "publishedAt":"2024-01-12T02:01:16.768Z", "title":"TrustLLM: Trustworthiness in Large Language Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/iY0paWKuqOqqGmsnnQ_u9.png", "numComments":2 }, { "paper":{ "id":"2401.06105", "authors":[ { "_id":"65a0bd799185dcca3061a974", "user":{ "avatarUrl":"/avatars/af6cc5a6dea128fc2c9700ec768018c4.svg", "isPro":false, "fullname":"Moab Arar", "user":"moabarar", "type":"user" }, "name":"Moab Arar", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:40:45.421Z", "hidden":false }, { "_id":"65a0bd799185dcca3061a975", "user":{ "avatarUrl":"/avatars/849e41404df698cc89c68939de45ec9a.svg", "isPro":false, "fullname":"Andrey Voynov", "user":"avoin", "type":"user" }, "name":"Andrey Voynov", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:40:51.688Z", "hidden":false }, { "_id":"65a0bd799185dcca3061a976", "user":{ "avatarUrl":"/avatars/bfc2a2169c080304124eb7f9ab04306d.svg", "isPro":false, "fullname":"Amir Hertz", "user":"amirhertz", "type":"user" }, "name":"Amir Hertz", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:40:57.655Z", "hidden":false }, { "_id":"65a0bd799185dcca3061a977", "user":{ "avatarUrl":"/avatars/4f728a5b70c9fe4a64e80e2b643ca620.svg", "isPro":false, "fullname":"Omri Avrahami", "user":"omriav", "type":"user" }, "name":"Omri Avrahami", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:41:04.417Z", "hidden":false }, { "_id":"65a0bd799185dcca3061a978", "user":{ "avatarUrl":"/avatars/60fd9305ccc56f17d6007f781c54fe2b.svg", "isPro":false, "fullname":"Shlomi Fruchter", "user":"shlomif", "type":"user" }, "name":"Shlomi Fruchter", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:01:53.899Z", "hidden":false }, { "_id":"65a0bd799185dcca3061a979", "name":"Yael Pritch", "hidden":false }, { "_id":"65a0bd799185dcca3061a97a", "user":{ 
"avatarUrl":"/avatars/008ecb3daa4c8187b5f339f1176b3c39.svg", "isPro":false, "fullname":"Daniel Cohen-Or", "user":"cohenor", "type":"user" }, "name":"Daniel Cohen-Or", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:02:04.337Z", "hidden":false }, { "_id":"65a0bd799185dcca3061a97b", "name":"Ariel Shamir", "hidden":false } ], "publishedAt":"2024-01-11T18:35:33.000Z", "title":"PALP: Prompt Aligned Personalization of Text-to-Image Models", "summary":"Content creators often aim to create personalized images using personal\nsubjects that go beyond the capabilities of conventional text-to-image models.\nAdditionally, they may want the resulting image to encompass a specific\nlocation, style, ambiance, and more. Existing personalization methods may\ncompromise personalization ability or the alignment to complex textual prompts.\nThis trade-off can impede the fulfillment of user prompts and subject fidelity.\nWe propose a new approach focusing on personalization methods for a\nsingle prompt to address this issue. We term our approach prompt-aligned\npersonalization. While this may seem restrictive, our method excels in\nimproving text alignment, enabling the creation of images with complex and\nintricate prompts, which may pose a challenge for current techniques. In\nparticular, our method keeps the personalized model aligned with a target\nprompt using an additional score distillation sampling term. We demonstrate the\nversatility of our method in multi- and single-shot settings and further show\nthat it can compose multiple subjects or use inspiration from reference images,\nsuch as artworks. We compare our approach quantitatively and qualitatively with\nexisting baselines and state-of-the-art techniques.", "upvotes":40 }, "publishedAt":"2024-01-12T04:18:04.903Z", "title":"PALP: Prompt Aligned Personalization of Text-to-Image Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/fRhm6EdVP_p8ZLsvYYJ7U.mp4", "numComments":2 }, { "paper":{ "id":"2401.02994", "authors":[ { "_id":"659cbc9e53f784cdcabe92c3", "user":{ "avatarUrl":"/avatars/433c5d02ceb2b7a18a29e9d5ec848350.svg", "isPro":false, "fullname":"Xiaoding Lu", "user":"xiaoding", "type":"user" }, "name":"Xiaoding Lu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:07:10.374Z", "hidden":false }, { "_id":"659cbc9e53f784cdcabe92c4", "user":{ "avatarUrl":"/avatars/3d6a73ada67636fde04caeb51c9a2953.svg", "isPro":false, "fullname":"Adian Liusie", "user":"adianl", "type":"user" }, "name":"Adian Liusie", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:07:16.712Z", "hidden":false }, { "_id":"659cbc9e53f784cdcabe92c5", "user":{ "avatarUrl":"/avatars/587d4e116584c9960d27e59b10cfd1a3.svg", "isPro":false, "fullname":"V R", "user":"vyasraina", "type":"user" }, "name":"Vyas Raina", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:07:23.535Z", "hidden":false }, { "_id":"659cbc9e53f784cdcabe92c6", "user":{ "avatarUrl":"/avatars/5a212220927b9ce6e3dbc895ff2e7481.svg", "isPro":false, "fullname":"Yuwen Zhang", "user":"thatyuwen", "type":"user" }, "name":"Yuwen Zhang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:07:45.399Z", "hidden":false }, { "_id":"659cbc9e53f784cdcabe92c7", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1632998777241-noauth.jpeg?w=200&h=200&f=face", "isPro":true, "fullname":"William Beauchamp", "user":"Meliodia", "type":"user" }, "name":"William Beauchamp", 
"status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:08:31.350Z", "hidden":false } ], "publishedAt":"2024-01-04T07:45:49.000Z", "title":"Blending Is All You Need: Cheaper, Better Alternative to\n Trillion-Parameters LLM", "summary":"In conversational AI research, there's a noticeable trend towards developing\nmodels with a larger number of parameters, exemplified by models like ChatGPT.\nWhile these expansive models tend to generate increasingly better chat\nresponses, they demand significant computational resources and memory. This\nstudy explores a pertinent question: Can a combination of smaller models\ncollaboratively achieve comparable or enhanced performance relative to a\nsingular large model? We introduce an approach termed \"blending\", a\nstraightforward yet effective method of integrating multiple chat AIs. Our\nempirical evidence suggests that when specific smaller models are\nsynergistically blended, they can potentially outperform or match the\ncapabilities of much larger counterparts. For instance, integrating just three\nmodels of moderate size (6B/13B paramaeters) can rival or even surpass the\nperformance metrics of a substantially larger model like ChatGPT (175B+\nparamaters). This hypothesis is rigorously tested using A/B testing\nmethodologies with a large user base on the Chai research platform over a span\nof thirty days. The findings underscore the potential of the \"blending\"\nstrategy as a viable approach for enhancing chat AI efficacy without a\ncorresponding surge in computational demands.", "upvotes":39 }, "publishedAt":"2024-01-09T03:25:20.535Z", "title":"Blending Is All You Need: Cheaper, Better Alternative to Trillion-Parameters LLM", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/4PpM1Gxw0hKboNaIzrk5a.png", "numComments":0 }, { "paper":{ "id":"2401.04468", "authors":[ { "_id":"659e050918ea5479fbf15fc1", "user":{ "avatarUrl":"/avatars/77298e99d2797cf917fdddc6d6de46eb.svg", "isPro":false, "fullname":"weimin wang ", "user":"weiminwang", "type":"user" }, "name":"Weimin Wang", "status":"claimed_verified", "statusLastChangedAt":"2024-01-11T08:12:59.343Z", "hidden":false }, { "_id":"659e050918ea5479fbf15fc2", "name":"Jiawei Liu", "hidden":false }, { "_id":"659e050918ea5479fbf15fc3", "user":{ "avatarUrl":"/avatars/1132d1ee68fb58ec635d57c8175caacd.svg", "isPro":false, "fullname":"Zhijie Lin", "user":"Ikuinen", "type":"user" }, "name":"Zhijie Lin", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T08:47:28.063Z", "hidden":false }, { "_id":"659e050918ea5479fbf15fc4", "user":{ "avatarUrl":"/avatars/c22300f7ad7e6a7a8b4c57ebdb5172ce.svg", "isPro":false, "fullname":"Yan Jiangqiao", "user":"Jiangqiao", "type":"user" }, "name":"Jiangqiao Yan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T08:47:37.928Z", "hidden":false }, { "_id":"659e050918ea5479fbf15fc5", "name":"Shuo Chen", "hidden":false }, { "_id":"659e050918ea5479fbf15fc6", "user":{ "avatarUrl":"/avatars/19acc8f679054e0c5347b8b137f11c97.svg", "isPro":false, "fullname":"Chetwin Low", "user":"ChetwinLow", "type":"user" }, "name":"Chetwin Low", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T08:48:26.606Z", "hidden":false }, { "_id":"659e050918ea5479fbf15fc7", "name":"Tuyen Hoang", "hidden":false }, { "_id":"659e050918ea5479fbf15fc8", "name":"Jie Wu", "hidden":false }, { "_id":"659e050918ea5479fbf15fc9", "user":{ "avatarUrl":"/avatars/0ec3c55d445264d43c0430f9edf88bf8.svg", "isPro":false, "fullname":"Jun Hao Liew", 
"user":"junhao910323", "type":"user" }, "name":"Jun Hao Liew", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T08:49:13.476Z", "hidden":false }, { "_id":"659e050918ea5479fbf15fca", "user":{ "avatarUrl":"/avatars/25da11a27e250739b13141af0e5a50d1.svg", "isPro":false, "fullname":"Hanshu YAN", "user":"hansyan", "type":"user" }, "name":"Hanshu Yan", "status":"claimed_verified", "statusLastChangedAt":"2024-01-10T08:38:48.442Z", "hidden":false }, { "_id":"659e050918ea5479fbf15fcb", "user":{ "avatarUrl":"/avatars/bc574036287170a77057893efaa48e2d.svg", "isPro":false, "fullname":"Zhou", "user":"DaQuan21", "type":"user" }, "name":"Daquan Zhou", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T08:49:35.998Z", "hidden":false }, { "_id":"659e050918ea5479fbf15fcc", "name":"Jiashi Feng", "hidden":false } ], "publishedAt":"2024-01-09T10:12:52.000Z", "title":"MagicVideo-V2: Multi-Stage High-Aesthetic Video Generation", "summary":"The growing demand for high-fidelity video generation from textual\ndescriptions has catalyzed significant research in this field. In this work, we\nintroduce MagicVideo-V2 that integrates the text-to-image model, video motion\ngenerator, reference image embedding module and frame interpolation module into\nan end-to-end video generation pipeline. Benefiting from these architecture\ndesigns, MagicVideo-V2 can generate an aesthetically pleasing, high-resolution\nvideo with remarkable fidelity and smoothness. It demonstrates superior\nperformance over leading Text-to-Video systems such as Runway, Pika 1.0, Morph,\nMoon Valley and Stable Video Diffusion model via user evaluation at large\nscale.", "upvotes":38 }, "publishedAt":"2024-01-10T02:46:38.295Z", "title":"MagicVideo-V2: Multi-Stage High-Aesthetic Video Generation", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/UTsPPOJFHtKOgZDLdcI-x.png", "numComments":5 }, { "paper":{ "id":"2401.05252", "authors":[ { "_id":"659f529fd56cbade8db370aa", "user":{ "avatarUrl":"/avatars/4d35f728b41f93881a9b67c337f4d1df.svg", "isPro":false, "fullname":"Chen", "user":"Lawrence-cj", "type":"user" }, "name":"Junsong Chen", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:37:30.196Z", "hidden":false }, { "_id":"659f529fd56cbade8db370ab", "name":"Yue Wu", "hidden":false }, { "_id":"659f529fd56cbade8db370ac", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/noauth/ofYN04WWtD60oeDP45pQN.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Simian Luo", "user":"SimianLuo", "type":"user" }, "name":"Simian Luo", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:33:40.265Z", "hidden":false }, { "_id":"659f529fd56cbade8db370ad", "user":{ "avatarUrl":"/avatars/015a92884d7b4476ab311ea9560318eb.svg", "isPro":false, "fullname":"xieenze", "user":"xieenze", "type":"user" }, "name":"Enze Xie", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:38:55.039Z", "hidden":false }, { "_id":"659f529fd56cbade8db370ae", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1649681653581-5f7fbd813e94f16a85448745.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Sayak Paul", "user":"sayakpaul", "type":"user" }, "name":"Sayak Paul", "status":"claimed_verified", "statusLastChangedAt":"2024-01-11T08:13:18.092Z", "hidden":false }, { "_id":"659f529fd56cbade8db370af", "name":"Ping Luo", "hidden":false }, { "_id":"659f529fd56cbade8db370b0", "user":{ 
"avatarUrl":"/avatars/f75a33b6bba1bb6aecbe018dcef39131.svg", "isPro":false, "fullname":"Hang Zhao", "user":"sgzhaohang", "type":"user" }, "name":"Hang Zhao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:35:02.476Z", "hidden":false }, { "_id":"659f529fd56cbade8db370b1", "user":{ "avatarUrl":"/avatars/c5cbb87cd51cc1341844f67d42c55151.svg", "isPro":false, "fullname":"lzg", "user":"lizhenguo", "type":"user" }, "name":"Zhenguo Li", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:35:30.043Z", "hidden":false } ], "publishedAt":"2024-01-10T16:27:38.000Z", "title":"PIXART-δ: Fast and Controllable Image Generation with Latent\n Consistency Models", "summary":"This technical report introduces PIXART-{\\delta}, a text-to-image synthesis\nframework that integrates the Latent Consistency Model (LCM) and ControlNet\ninto the advanced PIXART-{\\alpha} model. PIXART-{\\alpha} is recognized for its\nability to generate high-quality images of 1024px resolution through a\nremarkably efficient training process. The integration of LCM in\nPIXART-{\\delta} significantly accelerates the inference speed, enabling the\nproduction of high-quality images in just 2-4 steps. Notably, PIXART-{\\delta}\nachieves a breakthrough 0.5 seconds for generating 1024x1024 pixel images,\nmarking a 7x improvement over the PIXART-{\\alpha}. Additionally,\nPIXART-{\\delta} is designed to be efficiently trainable on 32GB V100 GPUs\nwithin a single day. With its 8-bit inference capability (von Platen et al.,\n2023), PIXART-{\\delta} can synthesize 1024px images within 8GB GPU memory\nconstraints, greatly enhancing its usability and accessibility. Furthermore,\nincorporating a ControlNet-like module enables fine-grained control over\ntext-to-image diffusion models. We introduce a novel ControlNet-Transformer\narchitecture, specifically tailored for Transformers, achieving explicit\ncontrollability alongside high-quality image generation. As a state-of-the-art,\nopen-source image generation model, PIXART-{\\delta} offers a promising\nalternative to the Stable Diffusion family of models, contributing\nsignificantly to text-to-image synthesis.", "upvotes":37 }, "publishedAt":"2024-01-11T03:19:58.535Z", "title":"PIXART-δ: Fast and Controllable Image Generation with Latent Consistency Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/l4v9s2xQHdqOJ8LY6hoXg.png", "numComments":3 }, { "paper":{ "id":"2401.06066", "authors":[ { "_id":"65a0a694db5d37ad5e8bca82", "user":{ "avatarUrl":"/avatars/896ed9f4cdbd317493b303d070b7e12a.svg", "isPro":false, "fullname":"Damai Dai", "user":"DeepSeekDDM", "type":"user" }, "name":"Damai Dai", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:06:25.776Z", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca83", "name":"Chengqi Deng", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca84", "user":{ "avatarUrl":"/avatars/528e9d84b65325317ba6871e429e50d6.svg", "isPro":false, "fullname":"Zhao Chenggang", "user":"zcgamazing", "type":"user" }, "name":"Chenggang Zhao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:06:49.782Z", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca85", "name":"R. X. 
Xu", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca86", "user":{ "avatarUrl":"/avatars/0fa1eb6ac6c1aeff3e65bc86a6617f64.svg", "isPro":false, "fullname":"Huazuo Gao", "user":"gaohuazuo", "type":"user" }, "name":"Huazuo Gao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:07:01.241Z", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca87", "user":{ "avatarUrl":"/avatars/e131cd22b55e4b62cc48779e512c3da1.svg", "isPro":false, "fullname":"deli chen", "user":"deli96", "type":"user" }, "name":"Deli Chen", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:07:08.725Z", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca88", "user":{ "avatarUrl":"/avatars/ae01ac0296d6ce1277dacb6894f570b8.svg", "isPro":false, "fullname":"Jiashi Li", "user":"Beginlner", "type":"user" }, "name":"Jiashi Li", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:07:15.235Z", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca89", "user":{ "avatarUrl":"/avatars/fdbff8012fe80d5f2e861cbfa2675125.svg", "isPro":false, "fullname":"wangding zeng", "user":"zwd973-deepseek", "type":"user" }, "name":"Wangding Zeng", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:07:27.958Z", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca8a", "user":{ "avatarUrl":"/avatars/973f4662023f0bbbc94c01dc3bb3edd3.svg", "isPro":false, "fullname":"Xingkai Yu", "user":"GeeeekExplorer", "type":"user" }, "name":"Xingkai Yu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:07:33.898Z", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca8b", "name":"Y. Wu", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca8c", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/6539f6ea26df26ecd1393c37/9VJusLLAiLhUxfBAFggpF.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Zhenda Xie", "user":"zdaxie", "type":"user" }, "name":"Zhenda Xie", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:07:42.020Z", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca8d", "name":"Y. K. Li", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca8e", "name":"Panpan Huang", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca8f", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/6538815d1bdb3c40db94fbfa/id7aSY8JUgKK2agKWLERt.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Fuli Luo", "user":"luofuli", "type":"user" }, "name":"Fuli Luo", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:08:04.084Z", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca90", "name":"Chong Ruan", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca91", "name":"Zhifang Sui", "hidden":false }, { "_id":"65a0a694db5d37ad5e8bca92", "name":"Wenfeng Liang", "hidden":false } ], "publishedAt":"2024-01-11T17:31:42.000Z", "title":"DeepSeekMoE: Towards Ultimate Expert Specialization in\n Mixture-of-Experts Language Models", "summary":"In the era of large language models, Mixture-of-Experts (MoE) is a promising\narchitecture for managing computational costs when scaling up model parameters.\nHowever, conventional MoE architectures like GShard, which activate the top-K\nout of N experts, face challenges in ensuring expert specialization, i.e.\neach expert acquires non-overlapping and focused knowledge. In response, we\npropose the DeepSeekMoE architecture towards ultimate expert specialization. 
It\ninvolves two principal strategies: (1) finely segmenting the experts into mN\nones and activating mK from them, allowing for a more flexible combination of\nactivated experts; (2) isolating K_s experts as shared ones, aiming at\ncapturing common knowledge and mitigating redundancy in routed experts.\nStarting from a modest scale with 2B parameters, we demonstrate that\nDeepSeekMoE 2B achieves comparable performance with GShard 2.9B, which has 1.5\ntimes the expert parameters and computation. In addition, DeepSeekMoE 2B nearly\napproaches the performance of its dense counterpart with the same number of\ntotal parameters, which set the upper bound of MoE models. Subsequently, we\nscale up DeepSeekMoE to 16B parameters and show that it achieves comparable\nperformance with LLaMA2 7B, with only about 40% of computations. Further, our\npreliminary efforts to scale up DeepSeekMoE to 145B parameters consistently\nvalidate its substantial advantages over the GShard architecture, and show its\nperformance comparable with DeepSeek 67B, using only 28.5% (maybe even 18.2%)\nof computations.", "upvotes":28 }, "publishedAt":"2024-01-12T02:40:21.271Z", "title":"DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/1HZxDdOSYIMaIsR1eyKBP.png", "numComments":0 }, { "paper":{ "id":"2401.04577", "authors":[ { "_id":"659e0f2b1692b39ff0bbfa72", "user":{ "avatarUrl":"/avatars/120ac6417b73627e98488afdc715227b.svg", "isPro":false, "fullname":"Alon Ziv", "user":"alonzi", "type":"user" }, "name":"Alon Ziv", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:04:48.360Z", "hidden":false }, { "_id":"659e0f2b1692b39ff0bbfa73", "user":{ "avatarUrl":"/avatars/73519deba3176be9c23d49f749aee5da.svg", "isPro":false, "fullname":"Itai Gat", "user":"itaigat", "type":"user" }, "name":"Itai Gat", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:05:02.069Z", "hidden":false }, { "_id":"659e0f2b1692b39ff0bbfa74", "name":"Gael Le Lan", "hidden":false }, { "_id":"659e0f2b1692b39ff0bbfa75", "user":{ "avatarUrl":"/avatars/24aaaeb700690bc84ad0212ce4ae9bd4.svg", "isPro":false, "fullname":"Tal Remez", "user":"TalRemez", "type":"user" }, "name":"Tal Remez", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:05:29.272Z", "hidden":false }, { "_id":"659e0f2b1692b39ff0bbfa76", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/noauth/IOWMd17Iwls0dXsY1OWjK.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Felix Kreuk", "user":"felixkreuk", "type":"user" }, "name":"Felix Kreuk", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:05:35.720Z", "hidden":false }, { "_id":"659e0f2b1692b39ff0bbfa77", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1666708948380-noauth.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Alexandre Défossez", "user":"adefossez", "type":"user" }, "name":"Alexandre Défossez", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:05:41.700Z", "hidden":false }, { "_id":"659e0f2b1692b39ff0bbfa78", "user":{ "avatarUrl":"/avatars/49f08d989ca505ae01bce5578a94f6fe.svg", "isPro":false, "fullname":"Jade Copet", "user":"JadeCopet", "type":"user" }, "name":"Jade Copet", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:05:47.556Z", "hidden":false }, { "_id":"659e0f2b1692b39ff0bbfa79", "user":{ 
"avatarUrl":"/avatars/b7ccbddfa745db854dc342be1327cd53.svg", "isPro":false, "fullname":"Gabriel Synnaeve", "user":"gsynnaeve", "type":"user" }, "name":"Gabriel Synnaeve", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:05:54.092Z", "hidden":false }, { "_id":"659e0f2b1692b39ff0bbfa7a", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/6481e135578646b5c2386728/SPva4iNw0pORiCXD45cx9.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Yossi Adi", "user":"adiyoss", "type":"user" }, "name":"Yossi Adi", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:06:00.172Z", "hidden":false } ], "publishedAt":"2024-01-09T14:29:39.000Z", "title":"Masked Audio Generation using a Single Non-Autoregressive Transformer", "summary":"We introduce MAGNeT, a masked generative sequence modeling method that\noperates directly over several streams of audio tokens. Unlike prior work,\nMAGNeT is comprised of a single-stage, non-autoregressive transformer. During\ntraining, we predict spans of masked tokens obtained from a masking scheduler,\nwhile during inference we gradually construct the output sequence using several\ndecoding steps. To further enhance the quality of the generated audio, we\nintroduce a novel rescoring method in which, we leverage an external\npre-trained model to rescore and rank predictions from MAGNeT, which will be\nthen used for later decoding steps. Lastly, we explore a hybrid version of\nMAGNeT, in which we fuse between autoregressive and non-autoregressive models\nto generate the first few seconds in an autoregressive manner while the rest of\nthe sequence is being decoded in parallel. We demonstrate the efficiency of\nMAGNeT for the task of text-to-music and text-to-audio generation and conduct\nan extensive empirical evaluation, considering both objective metrics and human\nstudies. The proposed approach is comparable to the evaluated baselines, while\nbeing significantly faster (x7 faster than the autoregressive baseline).\nThrough ablation studies and analysis, we shed light on the importance of each\nof the components comprising MAGNeT, together with pointing to the trade-offs\nbetween autoregressive and non-autoregressive modeling, considering latency,\nthroughput, and generation quality. 
Samples are available on our demo page\nhttps://pages.cs.huji.ac.il/adiyoss-lab/MAGNeT.", "upvotes":27 }, "publishedAt":"2024-01-10T03:29:48.355Z", "title":"Masked Audio Generation using a Single Non-Autoregressive Transformer", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/DfwseQQErYPa8Ni225r52.png", "numComments":5 }, { "paper":{ "id":"2401.06104", "authors":[ { "_id":"65a0a3dadb5d37ad5e8b016a", "user":{ "avatarUrl":"/avatars/ff4462062beb17b9ad427ae730e7974d.svg", "isPro":false, "fullname":"Matanel Oren", "user":"matanelo", "type":"user" }, "name":"Matanel Oren", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:33:09.910Z", "hidden":false }, { "_id":"65a0a3dadb5d37ad5e8b016b", "user":{ "avatarUrl":"/avatars/6e240f0add27bf1a6c04a9618eccdf83.svg", "isPro":false, "fullname":"Michael Hassid", "user":"hassid", "type":"user" }, "name":"Michael Hassid", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:33:16.772Z", "hidden":false }, { "_id":"65a0a3dadb5d37ad5e8b016c", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/6481e135578646b5c2386728/SPva4iNw0pORiCXD45cx9.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Yossi Adi", "user":"adiyoss", "type":"user" }, "name":"Yossi Adi", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:33:23.532Z", "hidden":false }, { "_id":"65a0a3dadb5d37ad5e8b016d", "user":{ "avatarUrl":"/avatars/42b1ad679e84b3212e3770ca1e6d64f2.svg", "isPro":false, "fullname":"Roy Schwartz", "user":"royschwartz", "type":"user" }, "name":"Roy Schwartz", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:34:04.289Z", "hidden":false } ], "publishedAt":"2024-01-11T18:35:26.000Z", "title":"Transformers are Multi-State RNNs", "summary":"Transformers are considered conceptually different compared to the previous\ngeneration of state-of-the-art NLP models - recurrent neural networks (RNNs).\nIn this work, we demonstrate that decoder-only transformers can in fact be\nconceptualized as infinite multi-state RNNs - an RNN variant with unlimited\nhidden state size. We further show that pretrained transformers can be\nconverted into finite multi-state RNNs by fixing the size of their\nhidden state. We observe that several existing transformers cache compression\ntechniques can be framed as such conversion policies, and introduce a novel\npolicy, TOVA, which is simpler compared to these policies. Our experiments with\nseveral long range tasks indicate that TOVA outperforms all other baseline\npolicies, while being nearly on par with the full (infinite) model, and using\nin some cases only 1{8} of the original cache size. Our results\nindicate that transformer decoder LLMs often behave in practice as RNNs. They\nalso lay out the option of mitigating one of their most painful computational\nbottlenecks - the size of their cache memory. 
We publicly release our code at\nhttps://github.com/schwartz-lab-NLP/TOVA.", "upvotes":26 }, "publishedAt":"2024-01-12T02:28:43.419Z", "title":"Transformers are Multi-State RNNs", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/1E499u2_kDGHv5MrCOB2k.png", "numComments":3 }, { "paper":{ "id":"2401.05335", "authors":[ { "_id":"659f66569f682a3147bd7eb5", "name":"Mohamad Shahbazi", "hidden":false }, { "_id":"659f66569f682a3147bd7eb6", "user":{ "avatarUrl":"/avatars/89a79e1e894a3bd962e3ab0f19a23ad6.svg", "isPro":false, "fullname":"Liesbeth Claessens", "user":"CalishaMienona", "type":"user" }, "name":"Liesbeth Claessens", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T11:04:36.881Z", "hidden":false }, { "_id":"659f66569f682a3147bd7eb7", "user":{ "avatarUrl":"/avatars/0377c4e6b35f8bd1a29d38c99a5943c1.svg", "isPro":false, "fullname":"Michael Niemeyer", "user":"mniemeyer", "type":"user" }, "name":"Michael Niemeyer", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T11:05:06.967Z", "hidden":false }, { "_id":"659f66569f682a3147bd7eb8", "name":"Edo Collins", "hidden":false }, { "_id":"659f66569f682a3147bd7eb9", "user":{ "avatarUrl":"/avatars/0aee84d132a78d4ec71663836a57a245.svg", "isPro":false, "fullname":"Alessio Tonioni", "user":"Alessiot", "type":"user" }, "name":"Alessio Tonioni", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T11:05:17.664Z", "hidden":false }, { "_id":"659f66569f682a3147bd7eba", "name":"Luc Van Gool", "hidden":false }, { "_id":"659f66569f682a3147bd7ebb", "name":"Federico Tombari", "hidden":false } ], "publishedAt":"2024-01-10T18:59:53.000Z", "title":"InseRF: Text-Driven Generative Object Insertion in Neural 3D Scenes", "summary":"We introduce InseRF, a novel method for generative object insertion in the\nNeRF reconstructions of 3D scenes. Based on a user-provided textual description\nand a 2D bounding box in a reference viewpoint, InseRF generates new objects in\n3D scenes. Recently, methods for 3D scene editing have been profoundly\ntransformed, owing to the use of strong priors of text-to-image diffusion\nmodels in 3D generative modeling. Existing methods are mostly effective in\nediting 3D scenes via style and appearance changes or removing existing\nobjects. Generating new objects, however, remains a challenge for such methods,\nwhich we address in this study. Specifically, we propose grounding the 3D\nobject insertion to a 2D object insertion in a reference view of the scene. The\n2D edit is then lifted to 3D using a single-view object reconstruction method.\nThe reconstructed object is then inserted into the scene, guided by the priors\nof monocular depth estimation methods. We evaluate our method on various 3D\nscenes and provide an in-depth analysis of the proposed components. Our\nexperiments with generative insertion of objects in several 3D scenes indicate\nthe effectiveness of our method compared to the existing methods. InseRF is\ncapable of controllable and 3D-consistent object insertion without requiring\nexplicit 3D information as input. 
Please visit our project page at\nhttps://mohamad-shahbazi.github.io/inserf.", "upvotes":23 }, "publishedAt":"2024-01-11T03:54:00.756Z", "title":"InseRF: Text-Driven Generative Object Insertion in Neural 3D Scenes", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/KHwPYpncQy7UBY4FauF7X.mp4", "numComments":0 }, { "paper":{ "id":"2401.05675", "authors":[ { "_id":"65a0ad5990ea9d75eaa753b8", "user":{ "avatarUrl":"/avatars/3331316476ef9f79cd4294e28c88cfe0.svg", "isPro":false, "fullname":"Seung Hyun Lee", "user":"Seanlee235", "type":"user" }, "name":"Seung Hyun Lee", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:35:17.506Z", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753b9", "name":"Yinxiao Li", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753ba", "user":{ "avatarUrl":"/avatars/dcd2a666aba505575a79ffcc08a98c9d.svg", "isPro":false, "fullname":"Ian Kerr", "user":"KeJunjie", "type":"user" }, "name":"Junjie Ke", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:36:21.063Z", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753bb", "name":"Innfarn Yoo", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753bc", "name":"Han Zhang", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753bd", "user":{ "avatarUrl":"/avatars/4989bcdda7ba1ae51cab8235c3e39275.svg", "isPro":false, "fullname":"Jiahui Yu", "user":"jiahuiyu", "type":"user" }, "name":"Jiahui Yu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:36:52.645Z", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753be", "name":"Qifei Wang", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753bf", "name":"Fei Deng", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753c0", "user":{ "avatarUrl":"/avatars/8a8b586e86d79520c5f0a866ed2f0b80.svg", "isPro":false, "fullname":"Glenn Entis", "user":"GEinSF", "type":"user" }, "name":"Glenn Entis", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:37:49.208Z", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753c1", "user":{ "avatarUrl":"/avatars/a6719ec5f0a323b2504794dbf26a94d9.svg", "isPro":false, "fullname":"Junfeng He", "user":"JF1980", "type":"user" }, "name":"Junfeng He", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:37:55.773Z", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753c2", "name":"Gang Li", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753c3", "user":{ "avatarUrl":"/avatars/89dc61984a1fc7f8a09624186fe9fd53.svg", "isPro":false, "fullname":"sangpil", "user":"kimsangpil", "type":"user" }, "name":"Sangpil Kim", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:38:11.190Z", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753c4", "name":"Irfan Essa", "hidden":false }, { "_id":"65a0ad5990ea9d75eaa753c5", "name":"Feng Yang", "hidden":false } ], "publishedAt":"2024-01-11T05:36:36.000Z", "title":"Parrot: Pareto-optimal Multi-Reward Reinforcement Learning Framework for\n Text-to-Image Generation", "summary":"Recent works demonstrate that using reinforcement learning (RL) with quality\nrewards can enhance the quality of generated images in text-to-image (T2I)\ngeneration. However, a simple aggregation of multiple rewards may cause\nover-optimization in certain metrics and degradation in others, and it is\nchallenging to manually find the optimal weights. An effective strategy to\njointly optimize multiple rewards in RL for T2I generation is highly desirable.\nThis paper introduces Parrot, a novel multi-reward RL framework for T2I\ngeneration. 
Through the use of the batch-wise Pareto optimal selection, Parrot\nautomatically identifies the optimal trade-off among different rewards during\nthe RL optimization of the T2I generation. Additionally, Parrot employs a joint\noptimization approach for the T2I model and the prompt expansion network,\nfacilitating the generation of quality-aware text prompts, thus further\nenhancing the final image quality. To counteract the potential catastrophic\nforgetting of the original user prompt due to prompt expansion, we introduce\noriginal prompt centered guidance at inference time, ensuring that the\ngenerated image remains faithful to the user input. Extensive experiments and a\nuser study demonstrate that Parrot outperforms several baseline methods across\nvarious quality criteria, including aesthetics, human preference, image\nsentiment, and text-image alignment.", "upvotes":19 }, "publishedAt":"2024-01-12T03:09:14.995Z", "title":"Parrot: Pareto-optimal Multi-Reward Reinforcement Learning Framework for Text-to-Image Generation", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/srse2rNWA1Zd0ArtqqpoJ.png", "numComments":1 }, { "paper":{ "id":"2401.05334", "authors":[ { "_id":"659f5cfd70cf8f1cbbf90e40", "user":{ "avatarUrl":"/avatars/6c5dda9e58747054a989f077a078f3dc.svg", "isPro":false, "fullname":"Zhaoxi Chen", "user":"FrozenBurning", "type":"user" }, "name":"Zhaoxi Chen", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:39:37.408Z", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e41", "name":"Gyeongsik Moon", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e42", "name":"Kaiwen Guo", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e43", "name":"Chen Cao", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e44", "name":"Stanislav Pidhorskyi", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e45", "name":"Tomas Simon", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e46", "name":"Rohan Joshi", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e47", "name":"Yuan Dong", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e48", "name":"Yichen Xu", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e49", "name":"Bernardo Pires", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e4a", "name":"He Wen", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e4b", "name":"Lucas Evans", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e4c", "name":"Bo Peng", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e4d", "name":"Julia Buffalini", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e4e", "name":"Autumn Trimble", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e4f", "name":"Kevyn McPhail", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e50", "name":"Melissa Schoeller", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e51", "name":"Shoou-I Yu", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e52", "name":"Javier Romero", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e53", "name":"Michael Zollhöfer", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e54", "name":"Yaser Sheikh", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e55", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1656826685333-62ab1ac1d48b4d8b048a3473.png?w=200&h=200&f=face", "isPro":false, "fullname":"Ziwei Liu", "user":"liuziwei7", "type":"user" }, "name":"Ziwei Liu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:39:16.197Z", "hidden":false }, { "_id":"659f5cfd70cf8f1cbbf90e56", "name":"Shunsuke Saito", "hidden":false } ], 
"publishedAt":"2024-01-10T18:59:51.000Z", "title":"URHand: Universal Relightable Hands", "summary":"Existing photorealistic relightable hand models require extensive\nidentity-specific observations in different views, poses, and illuminations,\nand face challenges in generalizing to natural illuminations and novel\nidentities. To bridge this gap, we present URHand, the first universal\nrelightable hand model that generalizes across viewpoints, poses,\nilluminations, and identities. Our model allows few-shot personalization using\nimages captured with a mobile phone, and is ready to be photorealistically\nrendered under novel illuminations. To simplify the personalization process\nwhile retaining photorealism, we build a powerful universal relightable prior\nbased on neural relighting from multi-view images of hands captured in a light\nstage with hundreds of identities. The key challenge is scaling the\ncross-identity training while maintaining personalized fidelity and sharp\ndetails without compromising generalization under natural illuminations. To\nthis end, we propose a spatially varying linear lighting model as the neural\nrenderer that takes physics-inspired shading as input feature. By removing\nnon-linear activations and bias, our specifically designed lighting model\nexplicitly keeps the linearity of light transport. This enables single-stage\ntraining from light-stage data while generalizing to real-time rendering under\narbitrary continuous illuminations across diverse identities. In addition, we\nintroduce the joint learning of a physically based model and our neural\nrelighting model, which further improves fidelity and generalization. Extensive\nexperiments show that our approach achieves superior performance over existing\nmethods in terms of both quality and generalizability. 
We also demonstrate\nquick personalization of URHand from a short phone scan of an unseen identity.", "upvotes":19 }, "publishedAt":"2024-01-11T03:14:07.837Z", "title":"URHand: Universal Relightable Hands", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/GRSnQqwcEocqHX-S6sWv6.mp4", "numComments":0 }, { "paper":{ "id":"2401.04658", "authors":[ { "_id":"659e09c4c474a955d4e840ac", "user":{ "avatarUrl":"/avatars/dfd78c8d55485c22be6e616670a633e5.svg", "isPro":false, "fullname":"zhenqin", "user":"Doreamonzzz", "type":"user" }, "name":"Zhen Qin", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:03:52.712Z", "hidden":false }, { "_id":"659e09c4c474a955d4e840ad", "user":{ "avatarUrl":"/avatars/0304a9f6eb7f5dee4d933d03222f94e9.svg", "isPro":false, "fullname":"Weigao Sun", "user":"Weigao", "type":"user" }, "name":"Weigao Sun", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:02:24.676Z", "hidden":false }, { "_id":"659e09c4c474a955d4e840ae", "user":{ "avatarUrl":"/avatars/af261b85bbd10a1182372ffc459640b8.svg", "isPro":false, "fullname":"LiDong", "user":"liddalidd", "type":"user" }, "name":"Dong Li", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:04:16.624Z", "hidden":false }, { "_id":"659e09c4c474a955d4e840af", "user":{ "avatarUrl":"/avatars/fd911e9143d1a7aedd21a7d611543fcc.svg", "isPro":false, "fullname":"Xuyang Shen", "user":"Ryan1122", "type":"user" }, "name":"Xuyang Shen", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:03:01.800Z", "hidden":false }, { "_id":"659e09c4c474a955d4e840b0", "user":{ "avatarUrl":"/avatars/419e59b78f7fc7f6b3ac343398d11ec1.svg", "isPro":false, "fullname":"Weixuan Sun", "user":"weixuansun", "type":"user" }, "name":"Weixuan Sun", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:03:27.460Z", "hidden":false }, { "_id":"659e09c4c474a955d4e840b1", "user":{ "avatarUrl":"/avatars/1b108661634af602717a4ab4b66a151f.svg", "isPro":false, "fullname":"Ian Zhong", "user":"IanZhong", "type":"user" }, "name":"Yiran Zhong", "status":"claimed_verified", "statusLastChangedAt":"2024-01-10T08:38:44.072Z", "hidden":false } ], "publishedAt":"2024-01-09T16:27:28.000Z", "title":"Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence\n Lengths in Large Language Models", "summary":"Linear attention is an efficient attention mechanism that has recently\nemerged as a promising alternative to conventional softmax attention. With its\nability to process tokens in linear computational complexities, linear\nattention, in theory, can handle sequences of unlimited length without\nsacrificing speed, i.e., maintaining a constant training speed for various\nsequence lengths with a fixed memory consumption. However, due to the issue\nwith cumulative summation (cumsum), current linear attention algorithms cannot\ndemonstrate their theoretical advantage in a causal setting. In this paper, we\npresent Lightning Attention-2, the first linear attention implementation that\nenables linear attention to realize its theoretical computational benefits. To\nachieve this, we leverage the thought of tiling, separately handling the\nintra-block and inter-block components in linear attention calculation.\nSpecifically, we utilize the conventional attention computation mechanism for\nthe intra-blocks and apply linear attention kernel tricks for the inter-blocks.\nA tiling technique is adopted through both forward and backward procedures to\ntake full advantage of the GPU hardware. 
We implement our algorithm in Triton\nto make it IO-aware and hardware-friendly. Various experiments are conducted on\ndifferent model sizes and sequence lengths. Lightning Attention-2 retains\nconsistent training and inference speed regardless of input sequence length and\nis significantly faster than other attention mechanisms. The source code is\navailable at https://github.com/OpenNLPLab/lightning-attention.", "upvotes":19 }, "publishedAt":"2024-01-10T03:06:44.561Z", "title":"Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence Lengths in Large Language Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/za9K33hI3IwvcHme4EkIM.png", "numComments":2 }, { "paper":{ "id":"2401.03462", "authors":[ { "_id":"659cc210c80023a02e24bd54", "user":{ "avatarUrl":"/avatars/1a20dec6a1e48017f9a650ac2e510b4e.svg", "isPro":false, "fullname":"Peitian Zhang", "user":"namespace-Pt", "type":"user" }, "name":"Peitian Zhang", "status":"claimed_verified", "statusLastChangedAt":"2024-01-09T09:16:51.563Z", "hidden":false }, { "_id":"659cc210c80023a02e24bd55", "name":"Zheng Liu", "hidden":false }, { "_id":"659cc210c80023a02e24bd56", "user":{ "avatarUrl":"/avatars/c0675d05a52192ee14e9ab1633353956.svg", "isPro":false, "fullname":"Xiao", "user":"Shitao", "type":"user" }, "name":"Shitao Xiao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:22:34.399Z", "hidden":false }, { "_id":"659cc210c80023a02e24bd57", "user":{ "avatarUrl":"/avatars/877a19686dd9780129cfcb85533e2e66.svg", "isPro":false, "fullname":"Ninglu Shao", "user":"rainym00d", "type":"user" }, "name":"Ninglu Shao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:22:44.167Z", "hidden":false }, { "_id":"659cc210c80023a02e24bd58", "user":{ "avatarUrl":"/avatars/d11f607c496b2ccf3ffe439628711abb.svg", "isPro":false, "fullname":"Qiwei Ye", "user":"aeros0ul", "type":"user" }, "name":"Qiwei Ye", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:22:51.963Z", "hidden":false }, { "_id":"659cc210c80023a02e24bd59", "name":"Zhicheng Dou", "hidden":false } ], "publishedAt":"2024-01-07T11:57:40.000Z", "title":"Soaring from 4K to 400K: Extending LLM's Context with Activation Beacon", "summary":"The utilization of long contexts poses a big challenge for large language\nmodels due to their limited context window length. Although the context window\ncan be extended through fine-tuning, it will result in a considerable cost at\nboth training and inference time, and exert an unfavorable impact to the LLM's\noriginal capabilities. In this work, we propose Activation Beacon, which\ncondenses LLM's raw activations into more compact forms such that it can\nperceive a much longer context with a limited context window. Activation Beacon\nis introduced as a plug-and-play module for the LLM. It fully preserves the\nLLM's original capability on short contexts while extending the new capability\non processing longer contexts. Besides, it works with short sliding windows to\nprocess the long context, which achieves a competitive memory and time\nefficiency in both training and inference. Activation Beacon is learned by the\nauto-regression task conditioned on a mixture of beacons with diversified\ncondensing ratios. Thanks to such a treatment, it can be efficiently trained\npurely with short-sequence data in just 10K steps, which consumes less than 9\nhours on a single 8xA800 GPU machine. 
The experimental studies show that\nActivation Beacon is able to extend Llama-2-7B's context length by times100\ntimes (from 4K to 400K), meanwhile achieving a superior result on both\nlong-context generation and understanding tasks. Our model and code will be\navailable at the BGE repository.", "upvotes":19 }, "publishedAt":"2024-01-09T03:48:33.134Z", "title":"Soaring from 4K to 400K: Extending LLM's Context with Activation Beacon", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/oj1jmJ60kzxVSNSEaaGGK.png", "numComments":0 }, { "paper":{ "id":"2401.06003", "authors":[ { "_id":"65a0b0200c8d993b1719484a", "user":{ "avatarUrl":"/avatars/30c737a333d823a139183ef3ebdb252a.svg", "isPro":false, "fullname":"Linus Franke", "user":"linusfranke", "type":"user" }, "name":"Linus Franke", "status":"claimed_verified", "statusLastChangedAt":"2024-01-12T14:53:29.998Z", "hidden":false }, { "_id":"65a0b0200c8d993b1719484b", "name":"Darius Rückert", "hidden":false }, { "_id":"65a0b0200c8d993b1719484c", "name":"Laura Fink", "hidden":true }, { "_id":"65a0b0200c8d993b1719484d", "name":"Marc Stamminger", "hidden":false } ], "publishedAt":"2024-01-11T16:06:36.000Z", "title":"TRIPS: Trilinear Point Splatting for Real-Time Radiance Field Rendering", "summary":"Point-based radiance field rendering has demonstrated impressive results for\nnovel view synthesis, offering a compelling blend of rendering quality and\ncomputational efficiency. However, also latest approaches in this domain are\nnot without their shortcomings. 3D Gaussian Splatting [Kerbl and Kopanas et al.\n2023] struggles when tasked with rendering highly detailed scenes, due to\nblurring and cloudy artifacts. On the other hand, ADOP [R\\\"uckert et al. 2022]\ncan accommodate crisper images, but the neural reconstruction network decreases\nperformance, it grapples with temporal instability and it is unable to\neffectively address large gaps in the point cloud.\n In this paper, we present TRIPS (Trilinear Point Splatting), an approach that\ncombines ideas from both Gaussian Splatting and ADOP. The fundamental concept\nbehind our novel technique involves rasterizing points into a screen-space\nimage pyramid, with the selection of the pyramid layer determined by the\nprojected point size. This approach allows rendering arbitrarily large points\nusing a single trilinear write. A lightweight neural network is then used to\nreconstruct a hole-free image including detail beyond splat resolution.\nImportantly, our render pipeline is entirely differentiable, allowing for\nautomatic optimization of both point sizes and positions.\n Our evaluation demonstrate that TRIPS surpasses existing state-of-the-art\nmethods in terms of rendering quality while maintaining a real-time frame rate\nof 60 frames per second on readily available hardware. 
This performance extends\nto challenging scenarios, such as scenes featuring intricate geometry,\nexpansive landscapes, and auto-exposed footage.", "upvotes":18 }, "publishedAt":"2024-01-12T03:21:07.953Z", "title":"TRIPS: Trilinear Point Splatting for Real-Time Radiance Field Rendering", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/KAd7LmMoGQBPx5-2cj22I.mp4", "numComments":0 }, { "paper":{ "id":"2401.06080", "authors":[ { "_id":"65a0a832ff65c2d46da034b6", "user":{ "avatarUrl":"/avatars/5bb3fac9a8fe849aac263b328f7f3c11.svg", "isPro":false, "fullname":"Binghai Wang", "user":"wybiacx", "type":"user" }, "name":"Binghai Wang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:22:55.860Z", "hidden":false }, { "_id":"65a0a832ff65c2d46da034b7", "name":"Rui Zheng", "hidden":false }, { "_id":"65a0a832ff65c2d46da034b8", "name":"Lu Chen", "hidden":false }, { "_id":"65a0a832ff65c2d46da034b9", "name":"Yan Liu", "hidden":false }, { "_id":"65a0a832ff65c2d46da034ba", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/noauth/uVXJd3TBssT_Ug4yAl3c9.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Shihan Dou", "user":"Ablustrund", "type":"user" }, "name":"Shihan Dou", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:23:45.513Z", "hidden":false }, { "_id":"65a0a832ff65c2d46da034bb", "name":"Caishuang Huang", "hidden":false }, { "_id":"65a0a832ff65c2d46da034bc", "name":"Wei Shen", "hidden":false }, { "_id":"65a0a832ff65c2d46da034bd", "user":{ "avatarUrl":"/avatars/8199d6181794b8feee8fe9df73267af2.svg", "isPro":false, "fullname":"jinsenjie", "user":"yizhidalaohu", "type":"user" }, "name":"Senjie Jin", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:24:17.457Z", "hidden":false }, { "_id":"65a0a832ff65c2d46da034be", "user":{ "avatarUrl":"/avatars/409c914c827282f4276d3500561fabf3.svg", "isPro":false, "fullname":"enyu", "user":"zhouenyu04", "type":"user" }, "name":"Enyu Zhou", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:24:30.372Z", "hidden":false }, { "_id":"65a0a832ff65c2d46da034bf", "user":{ "avatarUrl":"/avatars/9a39ed04ca60da503a4660a942ebb798.svg", "isPro":false, "fullname":"Chenyu Shi", "user":"Chenyu2", "type":"user" }, "name":"Chenyu Shi", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:24:38.589Z", "hidden":false }, { "_id":"65a0a832ff65c2d46da034c0", "name":"Songyang Gao", "hidden":false }, { "_id":"65a0a832ff65c2d46da034c1", "name":"Nuo Xu", "hidden":false }, { "_id":"65a0a832ff65c2d46da034c2", "user":{ "avatarUrl":"/avatars/8e73f91b35ac0d11671fd7399a43e723.svg", "isPro":false, "fullname":"Yuhao Zhou", "user":"YuhaoZhou", "type":"user" }, "name":"Yuhao Zhou", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:25:28.842Z", "hidden":false }, { "_id":"65a0a832ff65c2d46da034c3", "user":{ "avatarUrl":"/avatars/cff4f1535632e7c8abc33fe7b862f60d.svg", "isPro":false, "fullname":"Xiaoran Fan", "user":"cnxup", "type":"user" }, "name":"Xiaoran Fan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:25:36.148Z", "hidden":false }, { "_id":"65a0a832ff65c2d46da034c4", "user":{ "avatarUrl":"/avatars/a29a226664776927b15f1174b397ed75.svg", "isPro":false, "fullname":"Zhiheng Xi", "user":"WooooDyy", "type":"user" }, "name":"Zhiheng Xi", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:25:42.948Z", "hidden":false }, { "_id":"65a0a832ff65c2d46da034c5", "name":"Jun Zhao", "hidden":false }, { 
"_id":"65a0a832ff65c2d46da034c6", "name":"Xiao Wang", "hidden":false }, { "_id":"65a0a832ff65c2d46da034c7", "name":"Tao Ji", "hidden":false }, { "_id":"65a0a832ff65c2d46da034c8", "name":"Hang Yan", "hidden":false }, { "_id":"65a0a832ff65c2d46da034c9", "name":"Lixing Shen", "hidden":false }, { "_id":"65a0a832ff65c2d46da034ca", "name":"Zhan Chen", "hidden":false }, { "_id":"65a0a832ff65c2d46da034cb", "name":"Tao Gui", "hidden":false }, { "_id":"65a0a832ff65c2d46da034cc", "name":"Qi Zhang", "hidden":false }, { "_id":"65a0a832ff65c2d46da034cd", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1632381702899-61457b8deff2c9fdb4de4988.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Xipeng Qiu", "user":"xpqiu", "type":"user" }, "name":"Xipeng Qiu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:26:22.800Z", "hidden":false }, { "_id":"65a0a832ff65c2d46da034ce", "name":"Xuanjing Huang", "hidden":false }, { "_id":"65a0a832ff65c2d46da034cf", "name":"Zuxuan Wu", "hidden":false }, { "_id":"65a0a832ff65c2d46da034d0", "name":"Yu-Gang Jiang", "hidden":false } ], "publishedAt":"2024-01-11T17:56:59.000Z", "title":"Secrets of RLHF in Large Language Models Part II: Reward Modeling", "summary":"Reinforcement Learning from Human Feedback (RLHF) has become a crucial\ntechnology for aligning language models with human values and intentions,\nenabling models to produce more helpful and harmless responses. Reward models\nare trained as proxies for human preferences to drive reinforcement learning\noptimization. While reward models are often considered central to achieving\nhigh performance, they face the following challenges in practical applications:\n(1) Incorrect and ambiguous preference pairs in the dataset may hinder the\nreward model from accurately capturing human intent. (2) Reward models trained\non data from a specific distribution often struggle to generalize to examples\noutside that distribution and are not suitable for iterative RLHF training.\n In this report, we attempt to address these two issues. (1) From a data\nperspective, we propose a method to measure the strength of preferences within\nthe data, based on a voting mechanism of multiple reward models. Experimental\nresults confirm that data with varying preference strengths have different\nimpacts on reward model performance. We introduce a series of novel methods to\nmitigate the influence of incorrect and ambiguous preferences in the dataset\nand fully leverage high-quality preference data. (2) From an algorithmic\nstandpoint, we introduce contrastive learning to enhance the ability of reward\nmodels to distinguish between chosen and rejected responses, thereby improving\nmodel generalization. 
Furthermore, we employ meta-learning to enable the reward\nmodel to maintain the ability to differentiate subtle differences in\nout-of-distribution samples, and this approach can be utilized for iterative\nRLHF optimization.", "upvotes":18 }, "publishedAt":"2024-01-12T02:47:15.474Z", "title":"Secrets of RLHF in Large Language Models Part II: Reward Modeling", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/t6RdI5ayxVYgL-AlJCFyv.png", "numComments":0 }, { "paper":{ "id":"2401.05566", "authors":[ { "_id":"65a0a37f9e18386f471f3f7d", "name":"Evan Hubinger", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f7e", "name":"Carson Denison", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f7f", "user":{ "avatarUrl":"/avatars/cd3c8a97823e3cbc176fef245113624f.svg", "isPro":false, "fullname":"Jesse Mu", "user":"jayelm", "type":"user" }, "name":"Jesse Mu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:32:12.408Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f80", "user":{ "avatarUrl":"/avatars/1d71b80340b11a46e70cab86590b459e.svg", "isPro":false, "fullname":"Mike Lambert", "user":"MikeLambert", "type":"user" }, "name":"Mike Lambert", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:32:36.618Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f81", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/63272a638624baac667c8bdb/ylZ-FNT9PLhn8sBCD1wQm.png?w=200&h=200&f=face", "isPro":false, "fullname":"Meg Tong", "user":"meg-tong", "type":"user" }, "name":"Meg Tong", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:32:43.288Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f82", "name":"Monte MacDiarmid", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f83", "name":"Tamera Lanham", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f84", "name":"Daniel M. 
Ziegler", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f85", "user":{ "avatarUrl":"/avatars/792d426a6c306c41fa0818e63e7aeefb.svg", "isPro":false, "fullname":"Tim Maxwell", "user":"maxwellinked", "type":"user" }, "name":"Tim Maxwell", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:34:04.934Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f86", "user":{ "avatarUrl":"/avatars/2c6f84c6ecb71c047c5d043d5a89f791.svg", "isPro":false, "fullname":"Newton Cheng", "user":"nccheng", "type":"user" }, "name":"Newton Cheng", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:34:21.109Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f87", "name":"Adam Jermyn", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f88", "name":"Amanda Askell", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f89", "user":{ "avatarUrl":"/avatars/a05c5e3b964ca2a6b82eec0e3d89c064.svg", "isPro":false, "fullname":"Ansh Radhakrishnan", "user":"anshr", "type":"user" }, "name":"Ansh Radhakrishnan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:34:37.230Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f8a", "user":{ "avatarUrl":"/avatars/14d0075aa1b578cd7ee5f9e68d12e2f0.svg", "isPro":false, "fullname":"Cem Anil", "user":"anilcem", "type":"user" }, "name":"Cem Anil", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:34:44.023Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f8b", "name":"David Duvenaud", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f8c", "user":{ "avatarUrl":"/avatars/f8219c475af6995b6fb9ebe727c5be43.svg", "isPro":false, "fullname":"Deep Ganguli", "user":"dganguli", "type":"user" }, "name":"Deep Ganguli", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:34:55.651Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f8d", "name":"Fazl Barez", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f8e", "name":"Jack Clark", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f8f", "name":"Kamal Ndousse", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f90", "user":{ "avatarUrl":"/avatars/6448e205712ad2375fbba79a32cb62a1.svg", "isPro":false, "fullname":"Kshitij Sachan", "user":"ksachan", "type":"user" }, "name":"Kshitij Sachan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:35:43.559Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f91", "name":"Michael Sellitto", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f92", "name":"Mrinank Sharma", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f93", "name":"Nova DasSarma", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f94", "name":"Roger Grosse", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f95", "user":{ "avatarUrl":"/avatars/1058a4d9c30ac45fb036df951a44a270.svg", "isPro":false, "fullname":"Shauna Kravec", "user":"smkravec", "type":"user" }, "name":"Shauna Kravec", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:36:11.235Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f96", "user":{ "avatarUrl":"/avatars/6ec793667e225616ffbb69ae7d57566e.svg", "isPro":false, "fullname":"bai", "user":"baiyuntao", "type":"user" }, "name":"Yuntao Bai", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:36:24.369Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f97", "name":"Zachary Witten", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f98", "name":"Marina Favaro", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f99", "name":"Jan Brauner", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f9a", "name":"Holden Karnofsky", "hidden":false }, { 
"_id":"65a0a37f9e18386f471f3f9b", "name":"Paul Christiano", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f9c", "name":"Samuel R. Bowman", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f9d", "user":{ "avatarUrl":"/avatars/385427fe5fd7043abcc814ef50db6f4a.svg", "isPro":false, "fullname":"Logan Graham", "user":"logangraham", "type":"user" }, "name":"Logan Graham", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:37:01.310Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f9e", "user":{ "avatarUrl":"/avatars/e2f315a8d6241ae1a783ba2957a211a5.svg", "isPro":false, "fullname":"Jared Kaplan", "user":"FrizzleFried", "type":"user" }, "name":"Jared Kaplan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:37:09.365Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3f9f", "name":"Sören Mindermann", "hidden":false }, { "_id":"65a0a37f9e18386f471f3fa0", "user":{ "avatarUrl":"/avatars/d5dfd82040f22d58c8793676d3f76cd4.svg", "isPro":false, "fullname":"Ryan Greenblatt", "user":"ryan-greenblatt", "type":"user" }, "name":"Ryan Greenblatt", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:37:23.118Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3fa1", "user":{ "avatarUrl":"/avatars/276d2edcaa42c5378965c662c6feb9e5.svg", "isPro":false, "fullname":"Buck Shlegeris", "user":"bshlgrs", "type":"user" }, "name":"Buck Shlegeris", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:37:30.012Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3fa2", "user":{ "avatarUrl":"/avatars/9ac4f3d01c293ce97507be402dfa688c.svg", "isPro":false, "fullname":"Nicholas Schiefer", "user":"nschiefer", "type":"user" }, "name":"Nicholas Schiefer", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:37:37.371Z", "hidden":false }, { "_id":"65a0a37f9e18386f471f3fa3", "user":{ "avatarUrl":"/avatars/8af702e49a639f0d1574745b0ab25c75.svg", "isPro":false, "fullname":"Ethan Perez", "user":"EthanAraragi", "type":"user" }, "name":"Ethan Perez", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:37:58.725Z", "hidden":false } ], "publishedAt":"2024-01-10T22:14:35.000Z", "title":"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety\n Training", "summary":"Humans are capable of strategically deceptive behavior: behaving helpfully in\nmost situations, but then behaving very differently in order to pursue\nalternative objectives when given the opportunity. If an AI system learned such\na deceptive strategy, could we detect it and remove it using current\nstate-of-the-art safety training techniques? To study this question, we\nconstruct proof-of-concept examples of deceptive behavior in large language\nmodels (LLMs). For example, we train models that write secure code when the\nprompt states that the year is 2023, but insert exploitable code when the\nstated year is 2024. We find that such backdoored behavior can be made\npersistent, so that it is not removed by standard safety training techniques,\nincluding supervised fine-tuning, reinforcement learning, and adversarial\ntraining (eliciting unsafe behavior and then training to remove it). The\nbackdoored behavior is most persistent in the largest models and in models\ntrained to produce chain-of-thought reasoning about deceiving the training\nprocess, with the persistence remaining even when the chain-of-thought is\ndistilled away. 
Furthermore, rather than removing backdoors, we find that\nadversarial training can teach models to better recognize their backdoor\ntriggers, effectively hiding the unsafe behavior. Our results suggest that,\nonce a model exhibits deceptive behavior, standard techniques could fail to\nremove such deception and create a false impression of safety.", "upvotes":16 }, "publishedAt":"2024-01-12T02:27:13.301Z", "title":"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/41in4Gi4-aKzZswR8kQMY.png", "numComments":0 }, { "paper":{ "id":"2401.04718", "authors":[ { "_id":"659e2024962de309f03e330a", "user":{ "avatarUrl":"/avatars/c8c17689ed88b1966d2a36361e3f83cc.svg", "isPro":false, "fullname":"Xiaojuan Wang", "user":"xiaojwan", "type":"user" }, "name":"Xiaojuan Wang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T08:53:46.072Z", "hidden":false }, { "_id":"659e2024962de309f03e330b", "user":{ "avatarUrl":"/avatars/8264de115e6305e4720c78fc7dce7f8d.svg", "isPro":false, "fullname":"Taesung Park", "user":"taesungp", "type":"user" }, "name":"Taesung Park", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T08:54:14.489Z", "hidden":false }, { "_id":"659e2024962de309f03e330c", "name":"Yang Zhou", "hidden":false }, { "_id":"659e2024962de309f03e330d", "name":"Eli Shechtman", "hidden":false }, { "_id":"659e2024962de309f03e330e", "name":"Richard Zhang", "hidden":false } ], "publishedAt":"2024-01-09T18:44:48.000Z", "title":"Jump Cut Smoothing for Talking Heads", "summary":"A jump cut offers an abrupt, sometimes unwanted change in the viewing\nexperience. We present a novel framework for smoothing these jump cuts, in the\ncontext of talking head videos. We leverage the appearance of the subject from\nthe other source frames in the video, fusing it with a mid-level representation\ndriven by DensePose keypoints and face landmarks. To achieve motion, we\ninterpolate the keypoints and landmarks between the end frames around the cut.\nWe then use an image translation network from the keypoints and source frames,\nto synthesize pixels. Because keypoints can contain errors, we propose a\ncross-modal attention scheme to select and pick the most appropriate source\namongst multiple options for each key point. By leveraging this mid-level\nrepresentation, our method can achieve stronger results than a strong video\ninterpolation baseline. We demonstrate our method on various jump cuts in the\ntalking head videos, such as cutting filler words, pauses, and even random\ncuts. 
Our experiments show that we can achieve seamless transitions, even in\nthe challenging cases where the talking head rotates or moves drastically in\nthe jump cut.", "upvotes":15 }, "publishedAt":"2024-01-10T04:42:14.426Z", "title":"Jump Cut Smoothing for Talking Heads", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/h9VVa3NBDUNq_kd9QhYsz.mp4", "numComments":0 }, { "paper":{ "id":"2401.08541", "authors":[ { "_id":"65a7733cc26011a4a7ef8b3b", "user":{ "avatarUrl":"/avatars/8160ab72b0dd2fd468fa48289ffcc36e.svg", "isPro":false, "fullname":"Alaa El-Nouby", "user":"alaaelnouby", "type":"user" }, "name":"Alaaeldin El-Nouby", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:13:15.376Z", "hidden":false }, { "_id":"65a7733cc26011a4a7ef8b3c", "user":{ "avatarUrl":"/avatars/e2884cf99771247408f0a2f0edf58c92.svg", "isPro":false, "fullname":"Michal Klein", "user":"michalk8", "type":"user" }, "name":"Michal Klein", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:00:10.583Z", "hidden":false }, { "_id":"65a7733cc26011a4a7ef8b3d", "name":"Shuangfei Zhai", "hidden":false }, { "_id":"65a7733cc26011a4a7ef8b3e", "user":{ "avatarUrl":"/avatars/b98a1a5d07db426934106d40626995a5.svg", "isPro":false, "fullname":"Miguel Angel Bautista", "user":"mbautistamartin", "type":"user" }, "name":"Miguel Angel Bautista", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:00:34.844Z", "hidden":false }, { "_id":"65a7733cc26011a4a7ef8b3f", "user":{ "avatarUrl":"/avatars/c9507d6ec2b81681461837c8261703c7.svg", "isPro":false, "fullname":"Alexander Toshev", "user":"toshev", "type":"user" }, "name":"Alexander Toshev", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:00:41.730Z", "hidden":false }, { "_id":"65a7733cc26011a4a7ef8b40", "user":{ "avatarUrl":"/avatars/0dfbd5d7f1111ec8798e3a9bcb5f7e77.svg", "isPro":false, "fullname":"vaishaal shankar", "user":"vaishaal", "type":"user" }, "name":"Vaishaal Shankar", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:00:48.497Z", "hidden":false }, { "_id":"65a7733cc26011a4a7ef8b41", "user":{ "avatarUrl":"/avatars/52c5eca12499a1aa9bd49c43d4f20685.svg", "isPro":false, "fullname":"Joshua M. Susskind", "user":"jsusskind", "type":"user" }, "name":"Joshua M Susskind", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:00:56.399Z", "hidden":false }, { "_id":"65a7733cc26011a4a7ef8b42", "name":"Armand Joulin", "hidden":false } ], "publishedAt":"2024-01-16T18:03:37.000Z", "title":"Scalable Pre-training of Large Autoregressive Image Models", "summary":"This paper introduces AIM, a collection of vision models pre-trained with an\nautoregressive objective. These models are inspired by their textual\ncounterparts, i.e., Large Language Models (LLMs), and exhibit similar scaling\nproperties. Specifically, we highlight two key findings: (1) the performance of\nthe visual features scale with both the model capacity and the quantity of\ndata, (2) the value of the objective function correlates with the performance\nof the model on downstream tasks. We illustrate the practical implication of\nthese findings by pre-training a 7 billion parameter AIM on 2 billion images,\nthat achieves 84.0% on ImageNet-1k with a frozen trunk. Interestingly, even at\nthis scale, we observe no sign of saturation in performance, suggesting that\nAIM potentially represents a new frontier for training large-scale vision\nmodels. 
The pre-training of AIM is similar to the pre-training of LLMs, and\ndoes not require any image-specific strategy to stabilize the training at\nscale.", "upvotes":14 }, "publishedAt":"2024-01-17T06:27:09.153Z", "title":"Scalable Pre-training of Large Autoregressive Image Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/agQVTT_BcFBvRnhUcBL4R.png", "numComments":1 }, { "paper":{ "id":"2401.06102", "authors":[ { "_id":"65a0a8ae33d5d9ca30dbb594", "user":{ "avatarUrl":"/avatars/1685b6b1f2e9329d92b53e1070d8c7c4.svg", "isPro":false, "fullname":"AG", "user":"asmadotgh", "type":"user" }, "name":"Asma Ghandeharioun", "status":"claimed_verified", "statusLastChangedAt":"2024-01-12T08:06:29.260Z", "hidden":false }, { "_id":"65a0a8ae33d5d9ca30dbb595", "user":{ "avatarUrl":"/avatars/1c0dfc1fe7b62a78b76ec4f9e6b40b24.svg", "isPro":false, "fullname":"Avi Caciularu", "user":"codevan", "type":"user" }, "name":"Avi Caciularu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:38:58.246Z", "hidden":false }, { "_id":"65a0a8ae33d5d9ca30dbb596", "user":{ "avatarUrl":"/avatars/78310b60a2cd461a4b980a339a677c7c.svg", "isPro":false, "fullname":"Adam Pearce", "user":"1wheel", "type":"user" }, "name":"Adam Pearce", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:40:17.136Z", "hidden":false }, { "_id":"65a0a8ae33d5d9ca30dbb597", "user":{ "avatarUrl":"/avatars/640a06bb5767daa6d17f9ee000bb8900.svg", "isPro":false, "fullname":"Lucas Dixon", "user":"iislucas", "type":"user" }, "name":"Lucas Dixon", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T09:40:23.035Z", "hidden":false }, { "_id":"65a0a8ae33d5d9ca30dbb598", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1628140189042-noauth.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Mor Geva", "user":"mega", "type":"user" }, "name":"Mor Geva", "status":"claimed_verified", "statusLastChangedAt":"2024-01-12T08:06:25.609Z", "hidden":false } ], "publishedAt":"2024-01-11T18:33:48.000Z", "title":"Patchscope: A Unifying Framework for Inspecting Hidden Representations\n of Language Models", "summary":"Inspecting the information encoded in hidden representations of large\nlanguage models (LLMs) can explain models' behavior and verify their alignment\nwith human values. Given the capabilities of LLMs in generating\nhuman-understandable text, we propose leveraging the model itself to explain\nits internal representations in natural language. We introduce a framework\ncalled Patchscopes and show how it can be used to answer a wide range of\nresearch questions about an LLM's computation. We show that prior\ninterpretability methods based on projecting representations into the\nvocabulary space and intervening on the LLM computation, can be viewed as\nspecial instances of this framework. Moreover, several of their shortcomings\nsuch as failure in inspecting early layers or lack of expressivity can be\nmitigated by a Patchscope. 
Beyond unifying prior inspection techniques,\nPatchscopes also opens up new possibilities such as using a more capable model\nto explain the representations of a smaller model, and unlocks new applications\nsuch as self-correction in multi-hop reasoning.", "upvotes":14 }, "publishedAt":"2024-01-12T02:49:19.065Z", "title":"Patchscope: A Unifying Framework for Inspecting Hidden Representations of Language Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/eVgaUfR3krVVYKRzWbaA8.png", "numComments":0 }, { "paper":{ "id":"2401.06121", "authors":[ { "_id":"65a0ac08b90a1eb50fe0afff", "user":{ "avatarUrl":"/avatars/defca406f2c4f027a7b1be9a5ebbbd01.svg", "isPro":false, "fullname":"Pratyush Maini", "user":"pratyushmaini", "type":"user" }, "name":"Pratyush Maini", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:02:37.531Z", "hidden":false }, { "_id":"65a0ac08b90a1eb50fe0b000", "name":"Zhili Feng", "hidden":false }, { "_id":"65a0ac08b90a1eb50fe0b001", "user":{ "avatarUrl":"/avatars/529e9713e6ac835e11599ea7070a9603.svg", "isPro":false, "fullname":"Avi Schwarzschild", "user":"schwarzschild", "type":"user" }, "name":"Avi Schwarzschild", "status":"extracted_confirmed", "statusLastChangedAt":"2024-01-12T12:37:40.724Z", "hidden":false }, { "_id":"65a0ac08b90a1eb50fe0b002", "name":"Zachary C. Lipton", "hidden":false }, { "_id":"65a0ac08b90a1eb50fe0b003", "user":{ "avatarUrl":"/avatars/6d145d7e225b16bde64680ba8c05f856.svg", "isPro":true, "fullname":"Zico Kolter", "user":"zkolter", "type":"user" }, "name":"J. Zico Kolter", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:03:33.415Z", "hidden":false } ], "publishedAt":"2024-01-11T18:57:12.000Z", "title":"TOFU: A Task of Fictitious Unlearning for LLMs", "summary":"Large language models trained on massive corpora of data from the web can\nmemorize and reproduce sensitive or private data raising both legal and ethical\nconcerns. Unlearning, or tuning models to forget information present in their\ntraining data, provides us with a way to protect private data after training.\nAlthough several methods exist for such unlearning, it is unclear to what\nextent they result in models equivalent to those where the data to be forgotten\nwas never learned in the first place. To address this challenge, we present\nTOFU, a Task of Fictitious Unlearning, as a benchmark aimed at helping deepen\nour understanding of unlearning. We offer a dataset of 200 diverse synthetic\nauthor profiles, each consisting of 20 question-answer pairs, and a subset of\nthese profiles called the forget set that serves as the target for unlearning.\nWe compile a suite of metrics that work together to provide a holistic picture\nof unlearning efficacy. Finally, we provide a set of baseline results from\nexisting unlearning algorithms. 
Importantly, none of the baselines we consider\nshow effective unlearning motivating continued efforts to develop approaches\nfor unlearning that effectively tune models so that they truly behave as if\nthey were never trained on the forget data at all.", "upvotes":13 }, "publishedAt":"2024-01-12T03:03:36.869Z", "title":"TOFU: A Task of Fictitious Unlearning for LLMs", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/saRF-HF-97bCg64554hKJ.png", "numComments":0 }, { "paper":{ "id":"2401.05654", "authors":[ { "_id":"65a0a488ff65c2d46d9efe22", "name":"Tao Tu", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe23", "name":"Anil Palepu", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe24", "user":{ "avatarUrl":"/avatars/b7750f5d6281f906cfa5c717a44e2e13.svg", "isPro":false, "fullname":"Mike Schaekermann", "user":"mikeschaekermann", "type":"user" }, "name":"Mike Schaekermann", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:09:36.391Z", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe25", "name":"Khaled Saab", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe26", "user":{ "avatarUrl":"/avatars/91245f3b6a2eccebef872ed00bdfea73.svg", "isPro":false, "fullname":"Jan Freyberg", "user":"jfreyberg", "type":"user" }, "name":"Jan Freyberg", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:09:47.258Z", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe27", "name":"Ryutaro Tanno", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe28", "name":"Amy Wang", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe29", "name":"Brenna Li", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe2a", "name":"Mohamed Amin", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe2b", "name":"Nenad Tomasev", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe2c", "name":"Shekoofeh Azizi", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe2d", "user":{ "avatarUrl":"/avatars/7ad4a33503533ddbb54c71b634fda89b.svg", "isPro":false, "fullname":"Karan Singhal", "user":"karan1149", "type":"user" }, "name":"Karan Singhal", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:10:19.355Z", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe2e", "name":"Yong Cheng", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe2f", "user":{ "avatarUrl":"/avatars/a119644422335e260af0ca4ef6399870.svg", "isPro":false, "fullname":"Le Hou", "user":"solooobear", "type":"user" }, "name":"Le Hou", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:10:44.869Z", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe30", "user":{ "avatarUrl":"/avatars/f14c59fd56a49dd9a925285a4fb66fff.svg", "isPro":false, "fullname":"Albert Webson", "user":"awebson", "type":"user" }, "name":"Albert Webson", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:10:52.013Z", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe31", "name":"Kavita Kulkarni", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe32", "name":"S Sara Mahdavi", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe33", "name":"Christopher Semturs", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe34", "user":{ "avatarUrl":"/avatars/f4e047e82e41ec507c6f51aef7560d87.svg", "isPro":false, "fullname":"Juraj Gottweis", "user":"juro", "type":"user" }, "name":"Juraj Gottweis", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:11:10.554Z", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe35", "name":"Joelle Barral", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe36", "name":"Katherine Chou", "hidden":false }, { 
"_id":"65a0a488ff65c2d46d9efe37", "name":"Greg S Corrado", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe38", "name":"Yossi Matias", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe39", "name":"Alan Karthikesalingam", "hidden":false }, { "_id":"65a0a488ff65c2d46d9efe3a", "user":{ "avatarUrl":"/avatars/a389671494782f06ff8502c934090cc0.svg", "isPro":false, "fullname":"Vivek Natarajan", "user":"vivnat", "type":"user" }, "name":"Vivek Natarajan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:11:37.514Z", "hidden":false } ], "publishedAt":"2024-01-11T04:25:06.000Z", "title":"Towards Conversational Diagnostic AI", "summary":"At the heart of medicine lies the physician-patient dialogue, where skillful\nhistory-taking paves the way for accurate diagnosis, effective management, and\nenduring trust. Artificial Intelligence (AI) systems capable of diagnostic\ndialogue could increase accessibility, consistency, and quality of care.\nHowever, approximating clinicians' expertise is an outstanding grand challenge.\nHere, we introduce AMIE (Articulate Medical Intelligence Explorer), a Large\nLanguage Model (LLM) based AI system optimized for diagnostic dialogue.\n AMIE uses a novel self-play based simulated environment with automated\nfeedback mechanisms for scaling learning across diverse disease conditions,\nspecialties, and contexts. We designed a framework for evaluating\nclinically-meaningful axes of performance including history-taking, diagnostic\naccuracy, management reasoning, communication skills, and empathy. We compared\nAMIE's performance to that of primary care physicians (PCPs) in a randomized,\ndouble-blind crossover study of text-based consultations with validated patient\nactors in the style of an Objective Structured Clinical Examination (OSCE). The\nstudy included 149 case scenarios from clinical providers in Canada, the UK,\nand India, 20 PCPs for comparison with AMIE, and evaluations by specialist\nphysicians and patient actors. AMIE demonstrated greater diagnostic accuracy\nand superior performance on 28 of 32 axes according to specialist physicians\nand 24 of 26 axes according to patient actors. Our research has several\nlimitations and should be interpreted with appropriate caution. 
Clinicians were\nlimited to unfamiliar synchronous text-chat which permits large-scale\nLLM-patient interactions but is not representative of usual clinical practice.\nWhile further research is required before AMIE could be translated to\nreal-world settings, the results represent a milestone towards conversational\ndiagnostic AI.", "upvotes":13 }, "publishedAt":"2024-01-12T02:31:37.944Z", "title":"Towards Conversational Diagnostic AI", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/UpE3gkZBR0U21M_i_DuJY.png", "numComments":0 }, { "paper":{ "id":"2401.04575", "authors":[ { "_id":"659e0d65e6fe8ef6195521f4", "user":{ "avatarUrl":"/avatars/0c11bc0da4776bb7097eafbebcdf89d3.svg", "isPro":false, "fullname":"Yatong Bai", "user":"29Ar3d10", "type":"user" }, "name":"Yatong Bai", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:06:25.655Z", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521f5", "user":{ "avatarUrl":"/avatars/5334e8be63fc43bd3351af4fe5935e69.svg", "isPro":false, "fullname":"Utsav Garg", "user":"utsavgarg", "type":"user" }, "name":"Utsav Garg", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:06:32.495Z", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521f6", "user":{ "avatarUrl":"/avatars/0ef388cf224bfedde279f5cb2b7bf592.svg", "isPro":false, "fullname":"Apaar Shanker", "user":"ashanker9", "type":"user" }, "name":"Apaar Shanker", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:06:38.711Z", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521f7", "user":{ "avatarUrl":"/avatars/5c657d72ef6c59f182b2c5113f1c0e79.svg", "isPro":false, "fullname":"ZhangHaoMing", "user":"Blackfirezhm", "type":"user" }, "name":"Haoming Zhang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:06:56.015Z", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521f8", "user":{ "avatarUrl":"/avatars/ff5709d81010d8980f637775d92a71a5.svg", "isPro":false, "fullname":"Samyak Parajuli", "user":"samp830", "type":"user" }, "name":"Samyak Parajuli", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:07:04.605Z", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521f9", "user":{ "avatarUrl":"/avatars/f22fd2f02d4163f0a620727d9b79cc4a.svg", "isPro":false, "fullname":"Erhan", "user":"erhanbas", "type":"user" }, "name":"Erhan Bas", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:18:55.737Z", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521fa", "name":"Isidora Filipovic", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521fb", "user":{ "avatarUrl":"/avatars/44d22b8b6af085cf4fa2dec338232f2a.svg", "isPro":false, "fullname":"Amelia Chu", "user":"ameliachu", "type":"user" }, "name":"Amelia N. 
Chu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:19:30.619Z", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521fc", "name":"Eugenia D Fomitcheva", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521fd", "user":{ "avatarUrl":"/avatars/0ec3943018237bf407373030f433bea4.svg", "isPro":false, "fullname":"Elliot Branson", "user":"ElliotBranson", "type":"user" }, "name":"Elliot Branson", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:19:46.659Z", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521fe", "user":{ "avatarUrl":"/avatars/8e77e65091fac85857f36c48233e875e.svg", "isPro":false, "fullname":"aerin kim", "user":"aerinkim", "type":"user" }, "name":"Aerin Kim", "status":"extracted_confirmed", "statusLastChangedAt":"2024-01-10T03:27:05.979Z", "hidden":false }, { "_id":"659e0d65e6fe8ef6195521ff", "name":"Somayeh Sojoudi", "hidden":false }, { "_id":"659e0d65e6fe8ef619552200", "user":{ "avatarUrl":"/avatars/29080f550fdb471c4d2daace86aff3b0.svg", "isPro":false, "fullname":"Kyunghyun Cho", "user":"kyunghyuncho", "type":"user" }, "name":"Kyunghyun Cho", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:20:04.684Z", "hidden":false } ], "publishedAt":"2024-01-09T14:24:29.000Z", "title":"Let's Go Shopping (LGS) -- Web-Scale Image-Text Dataset for Visual\n Concept Understanding", "summary":"Vision and vision-language applications of neural networks, such as image\nclassification and captioning, rely on large-scale annotated datasets that\nrequire non-trivial data-collecting processes. This time-consuming endeavor\nhinders the emergence of large-scale datasets, limiting researchers and\npractitioners to a small number of choices. Therefore, we seek more efficient\nways to collect and annotate images. Previous initiatives have gathered\ncaptions from HTML alt-texts and crawled social media postings, but these data\nsources suffer from noise, sparsity, or subjectivity. For this reason, we turn\nto commercial shopping websites whose data meet three criteria: cleanliness,\ninformativeness, and fluency. We introduce the Let's Go Shopping (LGS) dataset,\na large-scale public dataset with 15 million image-caption pairs from publicly\navailable e-commerce websites. When compared with existing general-domain\ndatasets, the LGS images focus on the foreground object and have less complex\nbackgrounds. 
Our experiments on LGS show that the classifiers trained on\nexisting benchmark datasets do not readily generalize to e-commerce data, while\nspecific self-supervised visual feature extractors can better generalize.\nFurthermore, LGS's high-quality e-commerce-focused images and bimodal nature\nmake it advantageous for vision-language bi-modal tasks: LGS enables\nimage-captioning models to generate richer captions and helps text-to-image\ngeneration models achieve e-commerce style transfer.", "upvotes":13 }, "publishedAt":"2024-01-10T03:22:18.878Z", "title":"Let's Go Shopping (LGS) -- Web-Scale Image-Text Dataset for Visual Concept Understanding", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/s7sLVear4Tp9gMaCiU6Cs.png", "numComments":2 }, { "paper":{ "id":"2401.04398", "authors":[ { "_id":"659e0823b03a0f0e4f5359eb", "user":{ "avatarUrl":"/avatars/aa4ef471069cea0dffc8c7e4ee61a2fd.svg", "isPro":false, "fullname":"Zilong Wang", "user":"zilongwang", "type":"user" }, "name":"Zilong Wang", "status":"claimed_verified", "statusLastChangedAt":"2024-01-10T08:38:46.368Z", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359ec", "name":"Hao Zhang", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359ed", "name":"Chun-Liang Li", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359ee", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/640f5502a92fedb0e8511d66/3CFOBG_gm4WlQpQXXaQlr.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Julian Eisenschlos", "user":"eisenjulian", "type":"user" }, "name":"Julian Martin Eisenschlos", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:21:58.800Z", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359ef", "user":{ "avatarUrl":"/avatars/bae7b5297e779d91da21ff50389dd90a.svg", "isPro":false, "fullname":"Vincent", "user":"vperot", "type":"user" }, "name":"Vincent Perot", "status":"claimed_verified", "statusLastChangedAt":"2024-01-17T08:49:08.221Z", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359f0", "user":{ "avatarUrl":"/avatars/4eb059260f06837192c98927254cd89e.svg", "isPro":false, "fullname":"Zifeng Wang", "user":"zifengw", "type":"user" }, "name":"Zifeng Wang", "status":"claimed_verified", "statusLastChangedAt":"2024-01-10T10:39:32.480Z", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359f1", "name":"Lesly Miculicich", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359f2", "name":"Yasuhisa Fujii", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359f3", "name":"Jingbo Shang", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359f4", "user":{ "avatarUrl":"/avatars/e2bb108d1f6b1383f0e3f263c4747b3d.svg", "isPro":false, "fullname":"Chen-Yu Lee", "user":"chenyulee", "type":"user" }, "name":"Chen-Yu Lee", "status":"extracted_confirmed", "statusLastChangedAt":"2024-01-10T05:20:13.007Z", "hidden":false }, { "_id":"659e0823b03a0f0e4f5359f5", "name":"Tomas Pfister", "hidden":false } ], "publishedAt":"2024-01-09T07:46:26.000Z", "title":"Chain-of-Table: Evolving Tables in the Reasoning Chain for Table\n Understanding", "summary":"Table-based reasoning with large language models (LLMs) is a promising\ndirection to tackle many table understanding tasks, such as table-based\nquestion answering and fact verification. Compared with generic reasoning,\ntable-based reasoning requires the extraction of underlying semantics from both\nfree-form questions and semi-structured tabular data. 
Chain-of-Thought and its\nsimilar approaches incorporate the reasoning chain in the form of textual\ncontext, but it is still an open question how to effectively leverage tabular\ndata in the reasoning chain. We propose the Chain-of-Table framework, where\ntabular data is explicitly used in the reasoning chain as a proxy for\nintermediate thoughts. Specifically, we guide LLMs using in-context learning to\niteratively generate operations and update the table to represent a tabular\nreasoning chain. LLMs can therefore dynamically plan the next operation based\non the results of the previous ones. This continuous evolution of the table\nforms a chain, showing the reasoning process for a given tabular problem. The\nchain carries structured information of the intermediate results, enabling more\naccurate and reliable predictions. Chain-of-Table achieves new state-of-the-art\nperformance on WikiTQ, FeTaQA, and TabFact benchmarks across multiple LLM\nchoices.", "upvotes":12 }, "publishedAt":"2024-01-10T02:59:48.349Z", "title":"Chain-of-Table: Evolving Tables in the Reasoning Chain for Table Understanding", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/t6k-byY94Hycy27sBuGLf.png", "numComments":0 }, { "paper":{ "id":"2401.06129", "authors":[ { "_id":"65a0a233712e1321d0218825", "name":"Yue Zhao", "hidden":false }, { "_id":"65a0a233712e1321d0218826", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/noauth/tIVfxoAHAJ0sWyDMkaarA.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Long Zhao", "user":"garyzhao9012", "type":"user" }, "name":"Long Zhao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:44:06.358Z", "hidden":false }, { "_id":"65a0a233712e1321d0218827", "user":{ "avatarUrl":"/avatars/95ab04122ad626d74bf8a65d7b2bf48a.svg", "isPro":false, "fullname":"Xingyi Zhou", "user":"zhouxy", "type":"user" }, "name":"Xingyi Zhou", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:44:12.690Z", "hidden":false }, { "_id":"65a0a233712e1321d0218828", "name":"Jialin Wu", "hidden":false }, { "_id":"65a0a233712e1321d0218829", "name":"Chun-Te Chu", "hidden":false }, { "_id":"65a0a233712e1321d021882a", "name":"Hui Miao", "hidden":false }, { "_id":"65a0a233712e1321d021882b", "name":"Florian Schroff", "hidden":false }, { "_id":"65a0a233712e1321d021882c", "name":"Hartwig Adam", "hidden":false }, { "_id":"65a0a233712e1321d021882d", "name":"Ting Liu", "hidden":false }, { "_id":"65a0a233712e1321d021882e", "name":"Boqing Gong", "hidden":false }, { "_id":"65a0a233712e1321d021882f", "name":"Philipp Krähenbühl", "hidden":false }, { "_id":"65a0a233712e1321d0218830", "name":"Liangzhe Yuan", "hidden":false } ], "publishedAt":"2024-01-11T18:59:53.000Z", "title":"Distilling Vision-Language Models on Millions of Videos", "summary":"The recent advance in vision-language models is largely attributed to the\nabundance of image-text data. We aim to replicate this success for\nvideo-language models, but there simply is not enough human-curated video-text\ndata available. We thus resort to fine-tuning a video-language model from a\nstrong image-language baseline with synthesized instructional data. The\nresulting video-language model is then used to auto-label millions of videos to\ngenerate high-quality captions. We show the adapted video-language model\nperforms well on a wide range of video-language benchmarks. For instance, it\nsurpasses the best prior result on open-ended NExT-QA by 2.8%. 
Besides, our\nmodel generates detailed descriptions for previously unseen videos, which\nprovide better textual supervision than existing methods. Experiments show that\na video-language dual-encoder model contrastively trained on these\nauto-generated captions is 3.8% better than the strongest baseline that also\nleverages vision-language models. Our best model outperforms state-of-the-art\nmethods on MSR-VTT zero-shot text-to-video retrieval by 6%.", "upvotes":11 }, "publishedAt":"2024-01-12T02:21:40.338Z", "title":"Distilling Vision-Language Models on Millions of Videos", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/K8J4uOEMHWSZWoa-h7f4Q.png", "numComments":0 }, { "paper":{ "id":"2401.04925", "authors":[ { "_id":"659f5b037edb2a072edfaca6", "user":{ "avatarUrl":"/avatars/aa4e3a9dd2e4c6c474790f169b70d52e.svg", "isPro":false, "fullname":"Mingyu.Jin", "user":"Mingyu111", "type":"user" }, "name":"Mingyu Jin", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:54:45.675Z", "hidden":false }, { "_id":"659f5b037edb2a072edfaca7", "name":"Qinkai Yu", "hidden":false }, { "_id":"659f5b037edb2a072edfaca8", "name":"Dong shu", "hidden":false }, { "_id":"659f5b037edb2a072edfaca9", "name":"Haiyan Zhao", "hidden":false }, { "_id":"659f5b037edb2a072edfacaa", "user":{ "avatarUrl":"/avatars/03651951ac9faadb25349e0eb6ae7266.svg", "isPro":false, "fullname":"Wenyue Hua", "user":"wenyueH", "type":"user" }, "name":"Wenyue Hua", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:55:32.448Z", "hidden":false }, { "_id":"659f5b037edb2a072edfacab", "user":{ "avatarUrl":"/avatars/1b52669fd3d01947a8035462e7063aea.svg", "isPro":false, "fullname":"yanda meng", "user":"bigmaxmax", "type":"user" }, "name":"Yanda Meng", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:55:39.764Z", "hidden":false }, { "_id":"659f5b037edb2a072edfacac", "user":{ "avatarUrl":"/avatars/85a87fa82f7579019a0c6212f03b6480.svg", "isPro":false, "fullname":"Yongfeng Zhang", "user":"evison", "type":"user" }, "name":"Yongfeng Zhang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:55:46.463Z", "hidden":false }, { "_id":"659f5b037edb2a072edfacad", "name":"Mengnan Du", "hidden":false } ], "publishedAt":"2024-01-10T04:37:38.000Z", "title":"The Impact of Reasoning Step Length on Large Language Models", "summary":"Chain of Thought (CoT) is significant in improving the reasoning abilities of\nlarge language models (LLMs). However, the correlation between the\neffectiveness of CoT and the length of reasoning steps in prompts remains\nlargely unknown. To shed light on this, we have conducted several empirical\nexperiments to explore the relations. Specifically, we design experiments that\nexpand and compress the rationale reasoning steps within CoT demonstrations,\nwhile keeping all other factors constant. We have the following key findings.\nFirst, the results indicate that lengthening the reasoning steps in prompts,\neven without adding new information into the prompt, considerably enhances\nLLMs' reasoning abilities across multiple datasets. Alternatively, shortening\nthe reasoning steps, even while preserving the key information, significantly\ndiminishes the reasoning abilities of models. This finding highlights the\nimportance of the number of steps in CoT prompts and provides practical\nguidance to make better use of LLMs' potential in complex problem-solving\nscenarios. 
Second, we also investigated the relationship between the\nperformance of CoT and the rationales used in demonstrations. Surprisingly, the\nresult shows that even incorrect rationales can yield favorable outcomes if\nthey maintain the requisite length of inference. Third, we observed that the\nadvantages of increasing reasoning steps are task-dependent: simpler tasks\nrequire fewer steps, whereas complex tasks gain significantly from longer\ninference sequences.", "upvotes":11 }, "publishedAt":"2024-01-11T03:05:39.499Z", "title":"The Impact of Reasoning Step Length on Large Language Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/33G4qvyyqfGsMGMYGXjIj.png", "numComments":2 }, { "paper":{ "id":"2401.05033", "authors":[ { "_id":"659f5931210bd4a54e7fb38a", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1669982276443-noauth.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Dennis Ulmer", "user":"kaleidophon", "type":"user" }, "name":"Dennis Ulmer", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:59:25.771Z", "hidden":false }, { "_id":"659f5931210bd4a54e7fb38b", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1643336441806-noauth.png?w=200&h=200&f=face", "isPro":false, "fullname":"Elman Mansimov", "user":"mansimov", "type":"user" }, "name":"Elman Mansimov", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:59:35.919Z", "hidden":false }, { "_id":"659f5931210bd4a54e7fb38c", "user":{ "avatarUrl":"/avatars/645fd920492c16c90f86fffc0823a997.svg", "isPro":false, "fullname":"Lin Kaixiang", "user":"LeoSuk", "type":"user" }, "name":"Kaixiang Lin", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T10:59:47.945Z", "hidden":false }, { "_id":"659f5931210bd4a54e7fb38d", "name":"Justin Sun", "hidden":false }, { "_id":"659f5931210bd4a54e7fb38e", "name":"Xibin Gao", "hidden":false }, { "_id":"659f5931210bd4a54e7fb38f", "name":"Yi Zhang", "hidden":false } ], "publishedAt":"2024-01-10T09:49:10.000Z", "title":"Bootstrapping LLM-based Task-Oriented Dialogue Agents via Self-Talk", "summary":"Large language models (LLMs) are powerful dialogue agents, but specializing\nthem towards fulfilling a specific function can be challenging. Instructing\ntuning, i.e. tuning models on instruction and sample responses generated by\nhumans (Ouyang et al., 2022), has proven as an effective method to do so, yet\nrequires a number of data samples that a) might not be available or b) costly\nto generate. Furthermore, this cost increases when the goal is to make the LLM\nfollow a specific workflow within a dialogue instead of single instructions.\nInspired by the self-play technique in reinforcement learning and the use of\nLLMs to simulate human agents, we propose a more effective method for data\ncollection through LLMs engaging in a conversation in various roles. This\napproach generates a training data via \"self-talk\" of LLMs that can be refined\nand utilized for supervised fine-tuning. We introduce an automated way to\nmeasure the (partial) success of a dialogue. This metric is used to filter the\ngenerated conversational data that is fed back in LLM for training. Based on\nour automated and human evaluations of conversation quality, we demonstrate\nthat such self-talk data improves results. 
In addition, we examine the various\ncharacteristics that showcase the quality of generated dialogues and how they\ncan be connected to their potential utility as training data.", "upvotes":10 }, "publishedAt":"2024-01-11T02:57:54.804Z", "title":"Bootstrapping LLM-based Task-Oriented Dialogue Agents via Self-Talk", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/krSRbXOs5mGCADXGa_929.png", "numComments":0 }, { "paper":{ "id":"2401.03506", "authors":[ { "_id":"659cc8d41d398a23816921bd", "user":{ "avatarUrl":"/avatars/7f69e4ed9b57fd4eb4bf5bdba0fbd1ce.svg", "isPro":false, "fullname":"Quan", "user":"wq2012", "type":"user" }, "name":"Quan Wang", "status":"extracted_confirmed", "statusLastChangedAt":"2024-01-09T13:43:13.822Z", "hidden":false }, { "_id":"659cc8d41d398a23816921be", "name":"Yiling Huang", "hidden":false }, { "_id":"659cc8d41d398a23816921bf", "name":"Guanlong Zhao", "hidden":false }, { "_id":"659cc8d41d398a23816921c0", "name":"Evan Clark", "hidden":false }, { "_id":"659cc8d41d398a23816921c1", "name":"Wei Xia", "hidden":false }, { "_id":"659cc8d41d398a23816921c2", "user":{ "avatarUrl":"/avatars/3249c16a373becf94bf13155b13d487e.svg", "isPro":false, "fullname":"Hank Liao", "user":"HankLiao", "type":"user" }, "name":"Hank Liao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:33:50.308Z", "hidden":false } ], "publishedAt":"2024-01-07T14:54:57.000Z", "title":"DiarizationLM: Speaker Diarization Post-Processing with Large Language\n Models", "summary":"In this paper, we introduce DiarizationLM, a framework to leverage large\nlanguage models (LLM) to post-process the outputs from a speaker diarization\nsystem. Various goals can be achieved with the proposed framework, such as\nimproving the readability of the diarized transcript, or reducing the word\ndiarization error rate (WDER). In this framework, the outputs of the automatic\nspeech recognition (ASR) and speaker diarization systems are represented as a\ncompact textual format, which is included in the prompt to an optionally\nfinetuned LLM. The outputs of the LLM can be used as the refined diarization\nresults with the desired enhancement. As a post-processing step, this framework\ncan be easily applied to any off-the-shelf ASR and speaker diarization systems\nwithout retraining existing components. Our experiments show that a finetuned\nPaLM 2-S model can reduce the WDER by rel. 25.9% on the Fisher telephone\nconversation dataset, and rel. 
31% on the Callhome English dataset.", "upvotes":9 }, "publishedAt":"2024-01-09T04:17:25.441Z", "title":"DiarizationLM: Speaker Diarization Post-Processing with Large Language Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/LC5oryAGvz85jooNfQMGk.gif", "numComments":1 }, { "paper":{ "id":"2401.03003", "authors":[ { "_id":"659cbf6f0030e8faffb5e657", "user":{ "avatarUrl":"/avatars/00cbf728f69061de35f2d483caa08f8e.svg", "isPro":false, "fullname":"Linyuan Gong", "user":"gonglinyuan", "type":"user" }, "name":"Linyuan Gong", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T08:53:07.555Z", "hidden":false }, { "_id":"659cbf6f0030e8faffb5e658", "user":{ "avatarUrl":"/avatars/b46a0566af945bc871b9ff00f752fa28.svg", "isPro":false, "fullname":"Mostafa Elhoushi", "user":"melhoushi", "type":"user" }, "name":"Mostafa Elhoushi", "status":"extracted_confirmed", "statusLastChangedAt":"2024-01-09T12:57:02.649Z", "hidden":false }, { "_id":"659cbf6f0030e8faffb5e659", "name":"Alvin Cheung", "hidden":false } ], "publishedAt":"2024-01-05T06:51:08.000Z", "title":"AST-T5: Structure-Aware Pretraining for Code Generation and\n Understanding", "summary":"Large language models (LLMs) have made significant advancements in\ncode-related tasks, yet many LLMs treat code as simple sequences, neglecting\nits structured nature. We introduce AST-T5, a novel pretraining paradigm that\nleverages the Abstract Syntax Tree (AST) for enhanced code generation,\ntranspilation, and understanding. Using dynamic programming, our AST-Aware\nSegmentation retains code structure, while our AST-Aware Span Corruption\nobjective equips the model to reconstruct various code structures. Unlike other\nmodels, AST-T5 avoids intricate program analyses or architectural changes, so\nit integrates seamlessly with any encoder-decoder Transformer. Evaluations show\nthat AST-T5 consistently outperforms similar-sized LMs across various\ncode-related tasks. Structure-awareness makes AST-T5 particularly powerful in\ncode-to-code tasks, surpassing CodeT5 by 2 points in exact match score for the\nBugs2Fix task and by 3 points in exact match score for Java-C# Transpilation in\nCodeXGLUE. 
Our code and model are publicly available at\nhttps://github.com/gonglinyuan/ast_t5.", "upvotes":8 }, "publishedAt":"2024-01-09T03:37:19.850Z", "title":"AST-T5: Structure-Aware Pretraining for Code Generation and Understanding", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/6s_wYAiSQa5MHBPEoffXb.png", "numComments":2 }, { "paper":{ "id":"2401.02987", "authors":[ { "_id":"659cbe46442979361cd46580", "user":{ "avatarUrl":"/avatars/6bddc3e0d04775aaff9fe71c5f237baa.svg", "isPro":false, "fullname":"Prince Aboagye", "user":"paboagye", "type":"user" }, "name":"Prince Aboagye", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:07:54.590Z", "hidden":false }, { "_id":"659cbe46442979361cd46581", "name":"Yan Zheng", "hidden":false }, { "_id":"659cbe46442979361cd46582", "name":"Junpeng Wang", "hidden":false }, { "_id":"659cbe46442979361cd46583", "user":{ "avatarUrl":"/avatars/df548841cbf388f1de2f0245fa818fef.svg", "isPro":false, "fullname":"Uday Singh Saini", "user":"lazypikachu23", "type":"user" }, "name":"Uday Singh Saini", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:09:31.635Z", "hidden":false }, { "_id":"659cbe46442979361cd46584", "name":"Xin Dai", "hidden":false }, { "_id":"659cbe46442979361cd46585", "user":{ "avatarUrl":"/avatars/bffd1cc6e4920422ed577d4421057ca3.svg", "isPro":false, "fullname":"Michael Yeh", "user":"NCGTrigger", "type":"user" }, "name":"Michael Yeh", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:10:12.679Z", "hidden":false }, { "_id":"659cbe46442979361cd46586", "user":{ "avatarUrl":"/avatars/43d3b12c4df7db2f3fadc2c911a59908.svg", "isPro":false, "fullname":"fanyujie", "user":"fanyujie", "type":"user" }, "name":"Yujie Fan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:10:22.966Z", "hidden":false }, { "_id":"659cbe46442979361cd46587", "name":"Zhongfang Zhuang", "hidden":false }, { "_id":"659cbe46442979361cd46588", "user":{ "avatarUrl":"/avatars/8be23c2f74f94ee2aa2ffa78dddaf2e9.svg", "isPro":false, "fullname":"shubham jain", "user":"shubhjai", "type":"user" }, "name":"Shubham Jain", "status":"claimed_verified", "statusLastChangedAt":"2024-01-10T08:38:55.246Z", "hidden":false }, { "_id":"659cbe46442979361cd46589", "name":"Liang Wang", "hidden":false }, { "_id":"659cbe46442979361cd4658a", "name":"Wei Zhang", "hidden":false } ], "publishedAt":"2024-01-02T17:08:26.000Z", "title":"Has Your Pretrained Model Improved? A Multi-head Posterior Based\n Approach", "summary":"The emergence of pretrained models has significantly impacted fields ranging from Natural\nLanguage Processing (NLP) and Computer Vision to relational datasets.\nTraditionally, these models are assessed through fine-tuned downstream tasks.\nHowever, this raises the question of how to evaluate these models more\nefficiently and more effectively. In this study, we explore a novel approach\nwhere we leverage the meta features associated with each entity as a source of\nworldly knowledge and employ entity representations from the models. We propose\nusing the consistency between these representations and the meta features as a\nmetric for evaluating pretrained models. Our method's effectiveness is\ndemonstrated across various domains, including models with relational datasets,\nlarge language models, and image models.", "upvotes":8 }, "publishedAt":"2024-01-09T03:32:24.422Z", "title":"Has Your Pretrained Model Improved? 
A Multi-head Posterior Based Approach", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/rgRKYm42JfVm3g01HKNVT.png", "numComments":0 }, { "paper":{ "id":"2401.07519", "authors":[ { "_id":"65a76b903d3c83940823ebbe", "user":{ "avatarUrl":"/avatars/4c4c0aeea3ba01df3e50f0dfe21d49d1.svg", "isPro":false, "fullname":"wangqixun", "user":"wangqixun", "type":"user" }, "name":"Qixun Wang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:14:57.341Z", "hidden":false }, { "_id":"65a76b903d3c83940823ebbf", "name":"Xu Bai", "hidden":false }, { "_id":"65a76b903d3c83940823ebc0", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1669187672174-637745113a63a2983ffbde13.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Haofan Wang", "user":"wanghaofan", "type":"user" }, "name":"Haofan Wang", "status":"extracted_pending", "statusLastChangedAt":"2024-01-17T05:54:30.018Z", "hidden":false }, { "_id":"65a76b903d3c83940823ebc1", "name":"Zekui Qin", "hidden":false }, { "_id":"65a76b903d3c83940823ebc2", "name":"Anthony Chen", "hidden":false } ], "publishedAt":"2024-01-15T07:50:18.000Z", "title":"InstantID: Zero-shot Identity-Preserving Generation in Seconds", "summary":"There has been significant progress in personalized image synthesis with\nmethods such as Textual Inversion, DreamBooth, and LoRA. Yet, their real-world\napplicability is hindered by high storage demands, lengthy fine-tuning\nprocesses, and the need for multiple reference images. Conversely, existing ID\nembedding-based methods, while requiring only a single forward inference, face\nchallenges: they either necessitate extensive fine-tuning across numerous model\nparameters, lack compatibility with community pre-trained models, or fail to\nmaintain high face fidelity. Addressing these limitations, we introduce\nInstantID, a powerful diffusion model-based solution. Our plug-and-play module\nadeptly handles image personalization in various styles using just a single\nfacial image, while ensuring high fidelity. To achieve this, we design a novel\nIdentityNet by imposing strong semantic and weak spatial conditions,\nintegrating facial and landmark images with textual prompts to steer the image\ngeneration. InstantID demonstrates exceptional performance and efficiency,\nproving highly beneficial in real-world applications where identity\npreservation is paramount. Moreover, our work seamlessly integrates with\npopular pre-trained text-to-image diffusion models like SD1.5 and SDXL, serving\nas an adaptable plugin. 
Our code and pre-trained checkpoints will be available\nat https://github.com/InstantID/InstantID.", "upvotes":7 }, "publishedAt":"2024-01-17T05:54:30.078Z", "title":"InstantID: Zero-shot Identity-Preserving Generation in Seconds", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/hsBzAOuE6olEuzTfBGnax.png", "numComments":1 }, { "paper":{ "id":"2401.05583", "authors":[ { "_id":"65a0af0126d1e9df4fe416ce", "user":{ "avatarUrl":"/avatars/d4730f70a6f1cb51266862c7a8d54f77.svg", "isPro":false, "fullname":"Chaoyang Wang", "user":"cwang9", "type":"user" }, "name":"Chaoyang Wang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:16:02.817Z", "hidden":false }, { "_id":"65a0af0126d1e9df4fe416cf", "user":{ "avatarUrl":"/avatars/6973b4c959ef8bb72c40c611d384a78a.svg", "isPro":false, "fullname":"peiye zhuang", "user":"Kelest", "type":"user" }, "name":"Peiye Zhuang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:16:08.804Z", "hidden":false }, { "_id":"65a0af0126d1e9df4fe416d0", "user":{ "avatarUrl":"/avatars/76f933cd549f10e5e2db379de235d304.svg", "isPro":false, "fullname":"Aliaksandr Siarohin", "user":"aliaksandr-siarohin", "type":"user" }, "name":"Aliaksandr Siarohin", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:16:15.192Z", "hidden":false }, { "_id":"65a0af0126d1e9df4fe416d1", "user":{ "avatarUrl":"/avatars/c73c5870039611ab9162daad46a1ba20.svg", "isPro":false, "fullname":"junli cao", "user":"jlcao2", "type":"user" }, "name":"Junli Cao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:16:21.963Z", "hidden":false }, { "_id":"65a0af0126d1e9df4fe416d2", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/645fed74335c21d19f3bf76c/gwVsllRWtSHbg4a1erkdF.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Guocheng Qian", "user":"guochengqian", "type":"user" }, "name":"Guocheng Qian", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:16:28.558Z", "hidden":false }, { "_id":"65a0af0126d1e9df4fe416d3", "user":{ "avatarUrl":"/avatars/1cb94deadc288d07c571c9289085548c.svg", "isPro":false, "fullname":"Hsin-Ying Lee", "user":"james371507", "type":"user" }, "name":"Hsin-Ying Lee", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:16:34.561Z", "hidden":false }, { "_id":"65a0af0126d1e9df4fe416d4", "name":"Sergey Tulyakov", "hidden":false } ], "publishedAt":"2024-01-10T23:26:41.000Z", "title":"Diffusion Priors for Dynamic View Synthesis from Monocular Videos", "summary":"Dynamic novel view synthesis aims to capture the temporal evolution of visual\ncontent within videos. Existing methods struggle to distinguish between\nmotion and structure, particularly in scenarios where camera poses are either\nunknown or constrained compared to object motion. Furthermore, with information\nsolely from reference images, it is extremely challenging to hallucinate unseen\nregions that are occluded or partially observed in the given videos. To address\nthese issues, we first finetune a pretrained RGB-D diffusion model on the video\nframes using a customization technique. Subsequently, we distill the knowledge\nfrom the finetuned model to a 4D representation encompassing both dynamic and\nstatic Neural Radiance Fields (NeRF) components. The proposed pipeline achieves\ngeometric consistency while preserving the scene identity. We perform thorough\nexperiments to evaluate the efficacy of the proposed method qualitatively and\nquantitatively. 
Our results demonstrate the robustness and utility of our\napproach in challenging cases, further advancing dynamic novel view synthesis.", "upvotes":7 }, "publishedAt":"2024-01-12T03:16:20.134Z", "title":"Diffusion Priors for Dynamic View Synthesis from Monocular Videos", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/ha--2B5SzZn5_Y7poI_yS.png", "numComments":0 }, { "paper":{ "id":"2401.05391", "authors":[ { "_id":"65a09e24e0b49a84d21cca89", "user":{ "avatarUrl":"/avatars/27c5947d20cc91bf5ef61b48d2c50a9c.svg", "isPro":false, "fullname":"Hui Wu", "user":"wuhuikxCSU", "type":"user" }, "name":"Hui Wu", "status":"claimed_verified", "statusLastChangedAt":"2024-01-12T10:46:08.306Z", "hidden":false }, { "_id":"65a09e24e0b49a84d21cca8a", "name":"Yi Gan", "hidden":false }, { "_id":"65a09e24e0b49a84d21cca8b", "user":{ "avatarUrl":"/avatars/d57e51c4fa0bd3da1d1a96e1a18050f2.svg", "isPro":false, "fullname":"Feng Yuan", "user":"arthuryuan1987", "type":"user" }, "name":"Feng Yuan", "status":"claimed_verified", "statusLastChangedAt":"2024-01-15T07:42:04.025Z", "hidden":false }, { "_id":"65a09e24e0b49a84d21cca8c", "name":"Jing Ma", "hidden":false }, { "_id":"65a09e24e0b49a84d21cca8d", "name":"Wei Zhu", "hidden":false }, { "_id":"65a09e24e0b49a84d21cca8e", "user":{ "avatarUrl":"/avatars/23db8b19705a5e5f0b2bc2e2f4b0d8d0.svg", "isPro":false, "fullname":"xuyutao", "user":"vqaFW", "type":"user" }, "name":"Yutao Xu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:39:14.660Z", "hidden":false }, { "_id":"65a09e24e0b49a84d21cca8f", "name":"Hong Zhu", "hidden":false }, { "_id":"65a09e24e0b49a84d21cca90", "user":{ "avatarUrl":"/avatars/16aee3d0e7ca35cf0d036aa71e7738e2.svg", "isPro":false, "fullname":"zhuyuhua", "user":"TakaRika", "type":"user" }, "name":"Yuhua Zhu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:39:29.465Z", "hidden":false }, { "_id":"65a09e24e0b49a84d21cca91", "name":"Xiaoli Liu", "hidden":false }, { "_id":"65a09e24e0b49a84d21cca92", "name":"Jinghui Gu", "hidden":false } ], "publishedAt":"2023-12-19T05:40:43.000Z", "title":"Efficient LLM inference solution on Intel GPU", "summary":"Transformer-based Large Language Models (LLMs) have been widely used in many\nfields, and the efficiency of LLM inference has become a hot topic in real\napplications. However, LLMs usually have complicated model\nstructures with massive operations and perform inference in the auto-regressive\nmode, making it a challenging task to design a system with high efficiency.\n In this paper, we propose an efficient LLM inference solution with low\nlatency and high throughput. Firstly, we simplify the LLM decoder layer by\nfusing data movement and element-wise operations to reduce the memory access\nfrequency and lower system latency. We also propose a segment KV cache policy\nto keep key/value of the request and response tokens in separate physical\nmemory for effective device memory management, helping enlarge the runtime\nbatch size and improve system throughput. A customized\nScaled-Dot-Product-Attention kernel is designed to match our fusion policy\nbased on the segment KV cache solution. We implement our LLM inference solution\non Intel GPU and publish it publicly. 
Compared with the standard HuggingFace\nimplementation, the proposed solution achieves up to 7x lower token latency and\n27x higher throughput for some popular LLMs on Intel GPU.", "upvotes":7 }, "publishedAt":"2024-01-12T02:04:21.187Z", "title":"Efficient LLM inference solution on Intel GPU", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/6gmOA3lUAt4669tHBzvKy.png", "numComments":1 }, { "paper":{ "id":"2401.05314", "authors":[ { "_id":"659f6fbec0b775ab966257dd", "name":"Kevin Cai", "hidden":false }, { "_id":"659f6fbec0b775ab966257de", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/noauth/E8ndPzmtufhO9tvr4GsPi.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Chonghua Liu", "user":"mliu0515", "type":"user" }, "name":"Chonghua Liu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T11:18:43.130Z", "hidden":false }, { "_id":"659f6fbec0b775ab966257df", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1669920369989-noauth.jpeg?w=200&h=200&f=face", "isPro":true, "fullname":"David Chan", "user":"davidchan", "type":"user" }, "name":"David M. Chan", "status":"extracted_confirmed", "statusLastChangedAt":"2024-01-11T06:35:30.606Z", "hidden":false } ], "publishedAt":"2024-01-10T18:32:38.000Z", "title":"ANIM-400K: A Large-Scale Dataset for Automated End-To-End Dubbing of\n Video", "summary":"The Internet's wealth of content, with up to 60% published in English,\nstarkly contrasts with the global population, where only 18.8% are English speakers,\nand just 5.1% consider it their native language, leading to disparities in\nonline information access. Unfortunately, automated processes for dubbing of\nvideo - replacing the audio track of a video with a translated alternative -\nremain a complex and challenging task, as pipelines necessitate precise\ntiming, facial movement synchronization, and prosody matching. While end-to-end\ndubbing offers a solution, data scarcity continues to impede the progress of\nboth end-to-end and pipeline-based methods. In this work, we introduce\nAnim-400K, a comprehensive dataset of over 425K aligned animated video segments\nin Japanese and English supporting various video-related tasks, including\nautomated dubbing, simultaneous translation, guided video summarization, and\ngenre/theme/style classification. 
Our dataset is made publicly available for\nresearch purposes at https://github.com/davidmchan/Anim400K.", "upvotes":7 }, "publishedAt":"2024-01-11T04:34:06.969Z", "title":"ANIM-400K: A Large-Scale Dataset for Automated End-To-End Dubbing of Video", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/6cFgjM4Ek9tW3hFNXVtyz.png", "numComments":0 }, { "paper":{ "id":"2401.04695", "authors":[ { "_id":"659e0b010ce6bc9fbd08909b", "user":{ "avatarUrl":"/avatars/6bd01b4a6ea1e3a51d739d4104eb4ca9.svg", "isPro":false, "fullname":"Gal Yona", "user":"galyona", "type":"user" }, "name":"Gal Yona", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:36:27.520Z", "hidden":false }, { "_id":"659e0b010ce6bc9fbd08909c", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1667756718733-6363bf2b123a5d5cd4a8fe7c.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Roee Aharoni", "user":"roeeaharoni", "type":"user" }, "name":"Roee Aharoni", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:36:34.010Z", "hidden":false }, { "_id":"659e0b010ce6bc9fbd08909d", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1628140189042-noauth.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Mor Geva", "user":"mega", "type":"user" }, "name":"Mor Geva", "status":"admin_assigned", "statusLastChangedAt":"2024-01-10T09:36:41.188Z", "hidden":false } ], "publishedAt":"2024-01-09T17:44:36.000Z", "title":"Narrowing the Knowledge Evaluation Gap: Open-Domain Question Answering\n with Multi-Granularity Answers", "summary":"Factual questions typically can be answered correctly at different levels of\ngranularity. For example, both ``August 4, 1961'' and ``1961'' are correct\nanswers to the question ``When was Barack Obama born?''. Standard question\nanswering (QA) evaluation protocols, however, do not explicitly take this into\naccount and compare a predicted answer against answers of a single granularity\nlevel. In this work, we propose GRANOLA QA, a novel evaluation setting where a\npredicted answer is evaluated in terms of accuracy and informativeness against\na set of multi-granularity answers. We present a simple methodology for\nenriching existing datasets with multi-granularity answers, and create\nGRANOLA-EQ, a multi-granularity version of the EntityQuestions dataset. We\nevaluate a range of decoding methods on GRANOLA-EQ, including a new algorithm,\ncalled Decoding with Response Aggregation (DRAG), that is geared towards\naligning the response granularity with the model's uncertainty. Our experiments\nshow that large language models with standard decoding tend to generate\nspecific answers, which are often incorrect. In contrast, when evaluated on\nmulti-granularity answers, DRAG yields a nearly 20 point increase in accuracy\non average, which further increases for rare entities. 
Overall, this reveals\nthat standard evaluation and decoding schemes may significantly underestimate\nthe knowledge encapsulated in LMs.", "upvotes":7 }, "publishedAt":"2024-01-10T03:12:03.050Z", "title":"Narrowing the Knowledge Evaluation Gap: Open-Domain Question Answering with Multi-Granularity Answers", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/uEAx_Atd2-VvoKf-cbjk9.png", "numComments":0 }, { "paper":{ "id":"2401.03804", "authors":[ { "_id":"659cc74abc65f1e59d02aa56", "name":"Zihan Wang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa57", "name":"Xinzhang Liu", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa58", "name":"Shixuan Liu", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa59", "name":"Yitong Yao", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa5a", "name":"Yuyao Huang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa5b", "name":"Zhongjiang He", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa5c", "name":"Xuelong Li", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa5d", "name":"Yongxiang Li", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa5e", "name":"Zhonghao Che", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa5f", "name":"Zhaoxi Zhang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa60", "name":"Yan Wang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa61", "name":"Xin Wang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa62", "name":"Luwen Pu", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa63", "name":"Huihan Xu", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa64", "name":"Ruiyu Fang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa65", "name":"Yu Zhao", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa66", "name":"Jie Zhang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa67", "name":"Xiaomeng Huang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa68", "name":"Zhilong Lu", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa69", "name":"Jiaxin Peng", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa6a", "name":"Wenjun Zheng", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa6b", "name":"Shiquan Wang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa6c", "name":"Bingkai Yang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa6d", "name":"Xuewei he", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa6e", "name":"Zhuoru Jiang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa6f", "name":"Qiyi Xie", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa70", "name":"Yanhan Zhang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa71", "name":"Zhongqiu Li", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa72", "name":"Lingling Shi", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa73", "name":"Weiwei Fu", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa74", "name":"Yin Zhang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa75", "name":"Zilu Huang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa76", "name":"Sishi Xiong", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa77", "name":"Yuxiang Zhang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa78", "name":"Chao Wang", "hidden":false }, { "_id":"659cc74abc65f1e59d02aa79", "name":"Shuangyong Song", "hidden":false } ], "publishedAt":"2024-01-08T10:43:19.000Z", "title":"TeleChat Technical Report", "summary":"In this technical report, we present TeleChat, a collection of large language\nmodels (LLMs) with parameters of 3 billion, 7 billion and 12 billion. 
It\nincludes pretrained language models as well as fine-tuned chat models that are\naligned with human preferences. TeleChat is initially pretrained on an\nextensive corpus containing a diverse collection of texts from both English and\nChinese languages, including trillions of tokens. Subsequently, the model\nundergoes fine-tuning to align with human preferences, following a detailed\nmethodology that we describe. We evaluate the performance of TeleChat on\nvarious tasks, including language understanding, mathematics, reasoning, code\ngeneration, and knowledge-based question answering. Our findings indicate that\nTeleChat achieves comparable performance to other open-source models of similar\nsize across a wide range of public benchmarks. To support future research and\napplications utilizing LLMs, we release the fine-tuned model checkpoints of\nTeleChat's 7B and 12B variants, along with code and a portion of our pretraining\ndata, to the public community.", "upvotes":7 }, "publishedAt":"2024-01-09T04:10:53.530Z", "title":"TeleChat Technical Report", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/WM4E6KEA8OYnGTFEJkSEj.png", "numComments":0 }, { "paper":{ "id":"2401.05735", "authors":[ { "_id":"65a0ba66c5770b27aebc4d19", "user":{ "avatarUrl":"/avatars/ce4d46f575ba757f78eabdb25b394171.svg", "isPro":false, "fullname":"Kumara Kahatapitiya", "user":"kumarak", "type":"user" }, "name":"Kumara Kahatapitiya", "status":"claimed_verified", "statusLastChangedAt":"2024-01-12T13:42:11.048Z", "hidden":false }, { "_id":"65a0ba66c5770b27aebc4d1a", "name":"Adil Karjauv", "hidden":false }, { "_id":"65a0ba66c5770b27aebc4d1b", "name":"Davide Abati", "hidden":false }, { "_id":"65a0ba66c5770b27aebc4d1c", "name":"Fatih Porikli", "hidden":false }, { "_id":"65a0ba66c5770b27aebc4d1d", "name":"Yuki M. Asano", "hidden":false }, { "_id":"65a0ba66c5770b27aebc4d1e", "user":{ "avatarUrl":"/avatars/c1e34dee217d73f5539a0807adc12c80.svg", "isPro":false, "fullname":"Amir Habibian", "user":"habibian", "type":"user" }, "name":"Amirhossein Habibian", "status":"claimed_verified", "statusLastChangedAt":"2024-01-12T10:34:12.576Z", "hidden":false } ], "publishedAt":"2024-01-11T08:36:15.000Z", "title":"Object-Centric Diffusion for Efficient Video Editing", "summary":"Diffusion-based video editing has reached impressive quality and can\ntransform the global style, local structure, and attributes of given\nvideo inputs, following textual edit prompts. However, such solutions typically\nincur heavy memory and computational costs to generate temporally-coherent\nframes, either in the form of diffusion inversion and/or cross-frame attention.\nIn this paper, we conduct an analysis of such inefficiencies, and suggest\nsimple yet effective modifications that allow significant speed-ups whilst\nmaintaining quality. Moreover, we introduce Object-Centric Diffusion, coined as\nOCD, to further reduce latency by allocating computations more towards\nforeground edited regions that are arguably more important for perceptual\nquality. We achieve this by two novel proposals: i) Object-Centric Sampling,\ndecoupling the diffusion steps spent on salient regions or background,\nallocating most of the model capacity to the former, and ii) Object-Centric 3D\nToken Merging, which reduces the cost of cross-frame attention by fusing redundant\ntokens in unimportant background regions. 
Both techniques are readily\napplicable to a given video editing model without retraining, and can\ndrastically reduce its memory and computational cost. We evaluate our proposals\non inversion-based and control-signal-based editing pipelines, and show a\nlatency reduction up to 10x for a comparable synthesis quality.", "upvotes":6 }, "publishedAt":"2024-01-12T04:04:54.996Z", "title":"Object-Centric Diffusion for Efficient Video Editing", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/dfY_j3tQjWu7bXOhNTjU9.png", "numComments":0 }, { "paper":{ "id":"2401.06071", "authors":[ { "_id":"65a0a7315dfd8b9b1f288fe6", "name":"Zhaowei Li", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288fe7", "name":"Qi Xu", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288fe8", "name":"Dong Zhang", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288fe9", "name":"Hang Song", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288fea", "user":{ "avatarUrl":"/avatars/c740335400421904c06d1c576cb8ae87.svg", "isPro":false, "fullname":"Yiqing Cai", "user":"SmallCai", "type":"user" }, "name":"Yiqing Cai", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:30:47.851Z", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288feb", "name":"Qi Qi", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288fec", "name":"Ran Zhou", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288fed", "user":{ "avatarUrl":"/avatars/a23d57dcb190711494a2a7e75e4cb9a8.svg", "isPro":false, "fullname":"Junting Pan", "user":"juntingpan", "type":"user" }, "name":"Junting Pan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:30:22.022Z", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288fee", "name":"Zefeng Li", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288fef", "name":"Van Tu Vu", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288ff0", "name":"Zhida Huang", "hidden":false }, { "_id":"65a0a7315dfd8b9b1f288ff1", "name":"Tao Wang", "hidden":false } ], "publishedAt":"2024-01-11T17:41:57.000Z", "title":"LEGO:Language Enhanced Multi-modal Grounding Model", "summary":"Multi-modal large language models have demonstrated impressive performance\nacross various tasks in different modalities. However, existing multi-modal\nmodels primarily emphasize capturing global information within each modality\nwhile neglecting the importance of perceiving local information across\nmodalities. Consequently, these models lack the ability to effectively\nunderstand the fine-grained details of input data, limiting their performance\nin tasks that require a more nuanced understanding. To address this limitation,\nthere is a compelling need to develop models that enable fine-grained\nunderstanding across multiple modalities, thereby enhancing their applicability\nto a wide range of tasks. In this paper, we propose LEGO, a language enhanced\nmulti-modal grounding model. Beyond capturing global information like other\nmulti-modal models, our proposed model excels at tasks demanding a detailed\nunderstanding of local information within the input. It demonstrates precise\nidentification and localization of specific regions in images or moments in\nvideos. To achieve this objective, we design a diversified dataset construction\npipeline, resulting in a multi-modal, multi-granularity dataset for model\ntraining. 
The code, dataset, and demo of our model can be found at https:\n//github.com/lzw-lzw/LEGO.", "upvotes":6 }, "publishedAt":"2024-01-12T02:42:58.364Z", "title":"LEGO:Language Enhanced Multi-modal Grounding Model", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/RS19u0XvuZMiB3d0swKNN.png", "numComments":0 }, { "paper":{ "id":"2401.08417", "authors":[ { "_id":"65a776634908b2676c709a60", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1631080954171-61384b860317b0a5c10877d3.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Haoran Xu", "user":"haoranxu", "type":"user" }, "name":"Haoran Xu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:39:13.948Z", "hidden":false }, { "_id":"65a776634908b2676c709a61", "user":{ "avatarUrl":"/avatars/95bbb992cadfc8c3523ec2850d6e1e65.svg", "isPro":false, "fullname":"Amr Sharaf", "user":"amrsharaf", "type":"user" }, "name":"Amr Sharaf", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:39:21.852Z", "hidden":false }, { "_id":"65a776634908b2676c709a62", "user":{ "avatarUrl":"/avatars/5c6ac56171a24ee16098bf9c8dcd82f7.svg", "isPro":false, "fullname":"Yunmo Chen", "user":"yunmochen", "type":"user" }, "name":"Yunmo Chen", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:40:07.362Z", "hidden":false }, { "_id":"65a776634908b2676c709a63", "name":"Weiting Tan", "hidden":false }, { "_id":"65a776634908b2676c709a64", "name":"Lingfeng Shen", "hidden":false }, { "_id":"65a776634908b2676c709a65", "name":"Benjamin Van Durme", "hidden":false }, { "_id":"65a776634908b2676c709a66", "user":{ "avatarUrl":"/avatars/e2778c566b77d21a849e9dcef9bc686f.svg", "isPro":false, "fullname":"Kenton Murray", "user":"Kenton", "type":"user" }, "name":"Kenton Murray", "status":"extracted_pending", "statusLastChangedAt":"2024-01-17T06:40:35.869Z", "hidden":false }, { "_id":"65a776634908b2676c709a67", "user":{ "avatarUrl":"/avatars/0ee530cf80476aa3985c4d591cd384a1.svg", "isPro":false, "fullname":"Young Jin Kim", "user":"ykim362", "type":"user" }, "name":"Young Jin Kim", "status":"claimed_verified", "statusLastChangedAt":"2024-01-17T08:48:56.566Z", "hidden":false } ], "publishedAt":"2024-01-16T15:04:51.000Z", "title":"Contrastive Preference Optimization: Pushing the Boundaries of LLM\n Performance in Machine Translation", "summary":"Moderate-sized large language models (LLMs) -- those with 7B or 13B\nparameters -- exhibit promising machine translation (MT) performance. However,\neven the top-performing 13B LLM-based translation models, like ALMA, do not\nmatch the performance of state-of-the-art conventional encoder-decoder\ntranslation models or larger-scale LLMs such as GPT-4. In this study, we bridge\nthis performance gap. We first assess the shortcomings of supervised\nfine-tuning for LLMs in the MT task, emphasizing the quality issues present in\nthe reference data, despite being human-generated. Then, in contrast to SFT\nwhich mimics reference translations, we introduce Contrastive Preference\nOptimization (CPO), a novel approach that trains models to avoid generating\nadequate but not perfect translations. Applying CPO to ALMA models with only\n22K parallel sentences and 12M parameters yields significant improvements. 
The\nresulting model, called ALMA-R, can match or exceed the performance of the WMT\ncompetition winners and GPT-4 on WMT'21, WMT'22 and WMT'23 test datasets.", "upvotes":5 }, "publishedAt":"2024-01-17T06:40:35.895Z", "title":"Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/8z2LsPN_L9OdoevoTOZiO.png", "numComments":0 }, { "paper":{ "id":"2401.07781", "authors":[ { "_id":"65a76a9db4b9c261f7213bef", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/633aaf695df91da9cea92960/9T4y1ru5wt5iKUUqf9_Tt.png?w=200&h=200&f=face", "isPro":false, "fullname":"Jay Wu", "user":"jayw", "type":"user" }, "name":"Jay Zhangjie Wu", "status":"claimed_verified", "statusLastChangedAt":"2024-01-17T08:48:59.173Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf0", "name":"Guian Fang", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf1", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/632c7a0d1d303f5f9acf01b8/T010IFuCp6UaOeIyWhbCk.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"HaoningWu", "user":"haoningwu", "type":"user" }, "name":"Haoning Wu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:34:26.979Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf2", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/60e272ca6c78a8c122b12127/xldEGBzGrU-bX6IwAw0Ie.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Xintao Wang", "user":"Xintao", "type":"user" }, "name":"Xintao Wang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:34:43.604Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf3", "user":{ "avatarUrl":"/avatars/81da37d628163fe3e094b247c7c3a3b5.svg", "isPro":false, "fullname":"Yixiao Ge", "user":"yxgeee", "type":"user" }, "name":"Yixiao Ge", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:34:54.861Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf4", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/63184c517ca1b876d99b7e0e/b-qDExoeJuDXK0cJBZKnz.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Xiaodong Cun", "user":"vinthony", "type":"user" }, "name":"Xiaodong Cun", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:35:06.208Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf5", "user":{ "avatarUrl":"/avatars/caa99c3318c1d6d21be44d2d32795a62.svg", "isPro":false, "fullname":"David Junhao ZHANG", "user":"Junhao233", "type":"user" }, "name":"David Junhao Zhang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:35:12.573Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf6", "user":{ "avatarUrl":"/avatars/ac115f1a21c743db7c925f0e18451145.svg", "isPro":false, "fullname":"Jiawei Liu", "user":"JiaweiLIU", "type":"user" }, "name":"Jia-Wei Liu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:35:31.822Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf7", "user":{ "avatarUrl":"/avatars/7a7e8b39749eda61e57d8a1908726558.svg", "isPro":false, "fullname":"Gu Yuchao", "user":"guyuchao", "type":"user" }, "name":"Yuchao Gu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:35:59.418Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf8", "user":{ 
"avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/652b83b73b5997ed71a310f2/h2WLp1Loiy59vwHCOCkTM.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Rui Zhao", "user":"ruizhaocv", "type":"user" }, "name":"Rui Zhao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:36:28.581Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bf9", "name":"Weisi Lin", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bfa", "name":"Wynne Hsu", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bfb", "user":{ "avatarUrl":"/avatars/615e0d8622950b4408b40d550f02a894.svg", "isPro":false, "fullname":"Ying Shan", "user":"yshan2u", "type":"user" }, "name":"Ying Shan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:37:06.242Z", "hidden":false }, { "_id":"65a76a9db4b9c261f7213bfc", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1671779060549-noauth.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Mike Shou", "user":"mikeshou", "type":"user" }, "name":"Mike Zheng Shou", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:38:02.872Z", "hidden":false } ], "publishedAt":"2024-01-15T15:42:39.000Z", "title":"Towards A Better Metric for Text-to-Video Generation", "summary":"Generative models have demonstrated remarkable capability in synthesizing\nhigh-quality text, images, and videos. For video generation, contemporary\ntext-to-video models exhibit impressive capabilities, crafting visually\nstunning videos. Nonetheless, evaluating such videos poses significant\nchallenges. Current research predominantly employs automated metrics such as\nFVD, IS, and CLIP Score. However, these metrics provide an incomplete analysis,\nparticularly in the temporal assessment of video content, thus rendering them\nunreliable indicators of true video quality. Furthermore, while user studies\nhave the potential to reflect human perception accurately, they are hampered by\ntheir time-intensive and laborious nature, with outcomes that are often tainted\nby subjective bias. In this paper, we investigate the limitations inherent in\nexisting metrics and introduce a novel evaluation pipeline, the Text-to-Video\nScore (T2VScore). This metric integrates two pivotal criteria: (1) Text-Video\nAlignment, which scrutinizes the fidelity of the video in representing the\ngiven text description, and (2) Video Quality, which evaluates the video's\noverall production caliber with a mixture of experts. Moreover, to evaluate the\nproposed metrics and facilitate future improvements on them, we present the\nTVGE dataset, collecting human judgements of 2,543 text-to-video generated\nvideos on the two criteria. 
Experiments on the TVGE dataset demonstrate the\nsuperiority of the proposed T2VScore on offering a better metric for\ntext-to-video generation.", "upvotes":5 }, "publishedAt":"2024-01-17T05:50:25.930Z", "title":"Towards A Better Metric for Text-to-Video Generation", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/Er9TRXk_yScajuisjQu55.mp4", "numComments":1 }, { "paper":{ "id":"2401.05811", "authors":[ { "_id":"65a0a5e5b1f0788359aa31b7", "user":{ "avatarUrl":"/avatars/7be333e4f80e668f951d1e4c7351b89b.svg", "isPro":false, "fullname":"Zhuoyuan Mao", "user":"kevinmzy", "type":"user" }, "name":"Zhuoyuan Mao", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:43:00.631Z", "hidden":false }, { "_id":"65a0a5e5b1f0788359aa31b8", "user":{ "avatarUrl":"/avatars/7be333e4f80e668f951d1e4c7351b89b.svg", "isPro":false, "fullname":"Zhuoyuan Mao", "user":"kevinmzy", "type":"user" }, "name":"Yen Yu", "status":"extracted_pending", "statusLastChangedAt":"2024-01-12T02:37:26.368Z", "hidden":false } ], "publishedAt":"2024-01-11T10:28:17.000Z", "title":"Tuning LLMs with Contrastive Alignment Instructions for Machine\n Translation in Unseen, Low-resource Languages", "summary":"This article introduces contrastive alignment instructions (AlignInstruct) to\naddress two challenges in machine translation (MT) on large language models\n(LLMs). One is the expansion of supported languages to previously unseen ones.\nThe second relates to the lack of data in low-resource languages. Model\nfine-tuning through MT instructions (MTInstruct) is a straightforward approach\nto the first challenge. However, MTInstruct is limited by weak cross-lingual\nsignals inherent in the second challenge. AlignInstruct emphasizes\ncross-lingual supervision via a cross-lingual discriminator built using\nstatistical word alignments. 
Our results based on fine-tuning the BLOOMZ models\n(1b1, 3b, and 7b1) in up to 24 unseen languages showed that: (1) LLMs can\neffectively translate unseen languages using MTInstruct; (2) AlignInstruct led\nto consistent improvements in translation quality across 48 translation\ndirections involving English; (3) Discriminator-based instructions outperformed\ntheir generative counterparts as cross-lingual instructions; (4) AlignInstruct\nimproved performance in 30 zero-shot directions.", "upvotes":5 }, "publishedAt":"2024-01-12T02:37:26.398Z", "title":"Tuning LLMs with Contrastive Alignment Instructions for Machine Translation in Unseen, Low-resource Languages", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/qCLE5QVk8VnxRObZRyEgJ.png", "numComments":0 }, { "paper":{ "id":"2401.05293", "authors":[ { "_id":"659f6862874e583fed675471", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/648c964d39d2584ee47af19c/5UEkzDTMY8I3svKjR7kxN.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Thiemo Alldieck", "user":"thiemoall", "type":"user" }, "name":"Thiemo Alldieck", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T11:20:20.428Z", "hidden":false }, { "_id":"659f6862874e583fed675472", "user":{ "avatarUrl":"/avatars/53163398f1a629b9838548b808af53f7.svg", "isPro":false, "fullname":"Nikos Kolotouros", "user":"kolotouros", "type":"user" }, "name":"Nikos Kolotouros", "status":"admin_assigned", "statusLastChangedAt":"2024-01-11T11:20:28.010Z", "hidden":false }, { "_id":"659f6862874e583fed675473", "name":"Cristian Sminchisescu", "hidden":false } ], "publishedAt":"2024-01-10T17:51:46.000Z", "title":"Score Distillation Sampling with Learned Manifold Corrective", "summary":"Score Distillation Sampling (SDS) is a recent but already widely popular\nmethod that relies on an image diffusion model to control optimization problems\nusing text prompts. In this paper, we conduct an in-depth analysis of the SDS\nloss function, identify an inherent problem with its formulation, and propose a\nsurprisingly easy but effective fix. Specifically, we decompose the loss into\ndifferent factors and isolate the component responsible for noisy gradients. In\nthe original formulation, high text guidance is used to account for the noise,\nleading to unwanted side effects. Instead, we train a shallow network mimicking\nthe timestep-dependent denoising deficiency of the image diffusion model in\norder to effectively factor it out. 
We demonstrate the versatility and the\neffectiveness of our novel loss formulation through several qualitative and\nquantitative experiments, including optimization-based image synthesis and\nediting, zero-shot image translation network training, and text-to-3D\nsynthesis.", "upvotes":5 }, "publishedAt":"2024-01-11T04:02:45.113Z", "title":"Score Distillation Sampling with Learned Manifold Corrective", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/1uTkfPtj2XaNvyKPtXB6S.png", "numComments":0 }, { "paper":{ "id":"2401.04099", "authors":[ { "_id":"659ccde599aeb9b5de3a65e3", "user":{ "avatarUrl":"/avatars/b657180c7666735062782edd4f6a69c9.svg", "isPro":false, "fullname":"Dejia Xu", "user":"ir1d", "type":"user" }, "name":"Dejia Xu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:30:01.342Z", "hidden":false }, { "_id":"659ccde599aeb9b5de3a65e4", "name":"Ye Yuan", "hidden":false }, { "_id":"659ccde599aeb9b5de3a65e5", "user":{ "avatarUrl":"/avatars/dca5f6b3e867ed28f80882ea2b32c2c8.svg", "isPro":false, "fullname":"morteza mardani", "user":"mortezamardani", "type":"user" }, "name":"Morteza Mardani", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:30:30.967Z", "hidden":false }, { "_id":"659ccde599aeb9b5de3a65e6", "user":{ "avatarUrl":"/avatars/c553bff4bd52b9a4f79e9c76fa22e27e.svg", "isPro":false, "fullname":"Sifei Liu", "user":"zwrq", "type":"user" }, "name":"Sifei Liu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:30:58.222Z", "hidden":false }, { "_id":"659ccde599aeb9b5de3a65e7", "user":{ "avatarUrl":"/avatars/2bf162889fa726ed18cc205b3f28609e.svg", "isPro":false, "fullname":"Jiaming Song", "user":"jiamings", "type":"user" }, "name":"Jiaming Song", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:31:05.712Z", "hidden":false }, { "_id":"659ccde599aeb9b5de3a65e8", "name":"Zhangyang Wang", "hidden":false }, { "_id":"659ccde599aeb9b5de3a65e9", "user":{ "avatarUrl":"/avatars/b22db0823311f866c00db2efc4b9f814.svg", "isPro":false, "fullname":"Arash Vahdat", "user":"avahdat", "type":"user" }, "name":"Arash Vahdat", "status":"admin_assigned", "statusLastChangedAt":"2024-01-09T09:31:39.970Z", "hidden":false } ], "publishedAt":"2024-01-08T18:56:33.000Z", "title":"AGG: Amortized Generative 3D Gaussians for Single Image to 3D", "summary":"Given the growing need for automatic 3D content creation pipelines, various\n3D representations have been studied to generate 3D objects from a single\nimage. Due to its superior rendering efficiency, 3D Gaussian splatting-based\nmodels have recently excelled in both 3D reconstruction and generation. 3D\nGaussian splatting approaches for image to 3D generation are often\noptimization-based, requiring many computationally expensive score-distillation\nsteps. To overcome these challenges, we introduce an Amortized Generative 3D\nGaussian framework (AGG) that instantly produces 3D Gaussians from a single\nimage, eliminating the need for per-instance optimization. Utilizing an\nintermediate hybrid representation, AGG decomposes the generation of 3D\nGaussian locations and other appearance attributes for joint optimization.\nMoreover, we propose a cascaded pipeline that first generates a coarse\nrepresentation of the 3D data and later upsamples it with a 3D Gaussian\nsuper-resolution module. 
Our method is evaluated against existing\noptimization-based 3D Gaussian frameworks and sampling-based pipelines\nutilizing other 3D representations, where AGG showcases competitive generation\nabilities both qualitatively and quantitatively while being several orders of\nmagnitude faster. Project page: https://ir1d.github.io/AGG/", "upvotes":5 }, "publishedAt":"2024-01-09T04:39:01.616Z", "title":"AGG: Amortized Generative 3D Gaussians for Single Image to 3D", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/fLZHvWHG-syKOGXJxkOxT.mp4", "numComments":0 }, { "paper":{ "id":"2401.08565", "authors":[ { "_id":"65a772455c58475cf91ca68d", "user":{ "avatarUrl":"/avatars/b95954a88c7b15524a3d7776a4abe083.svg", "isPro":false, "fullname":"Alisa Liu", "user":"alisawuffles", "type":"user" }, "name":"Alisa Liu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:42:45.304Z", "hidden":false }, { "_id":"65a772455c58475cf91ca68e", "user":{ "avatarUrl":"/avatars/be03c997aeddee49d4df0e233b125fce.svg", "isPro":false, "fullname":"Xiaochuang Han", "user":"xhan77", "type":"user" }, "name":"Xiaochuang Han", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:42:51.194Z", "hidden":false }, { "_id":"65a772455c58475cf91ca68f", "user":{ "avatarUrl":"/avatars/858ce56df314107cb63920d1a511b146.svg", "isPro":false, "fullname":"Yizhong Wang", "user":"yizhongw", "type":"user" }, "name":"Yizhong Wang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:43:01.186Z", "hidden":false }, { "_id":"65a772455c58475cf91ca690", "name":"Yulia Tsvetkov", "hidden":false }, { "_id":"65a772455c58475cf91ca691", "user":{ "avatarUrl":"/avatars/52e54bdd6a1fb6c774a40cd70f3d7925.svg", "isPro":false, "fullname":"Yejin Choi", "user":"yejinchoinka", "type":"user" }, "name":"Yejin Choi", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:43:14.472Z", "hidden":false }, { "_id":"65a772455c58475cf91ca692", "name":"Noah A. Smith", "hidden":false } ], "publishedAt":"2024-01-16T18:49:55.000Z", "title":"Tuning Language Models by Proxy", "summary":"Despite the general capabilities of large pretrained language models, they\nconsistently benefit from further adaptation to better achieve desired\nbehaviors. However, tuning these models has become increasingly\nresource-intensive, or impossible when model weights are private. We introduce\nproxy-tuning, a lightweight decoding-time algorithm that operates on top of\nblack-box LMs to achieve the result of directly tuning the model, but by\naccessing only its prediction over the output vocabulary. Our method instead\ntunes a smaller LM, then applies the difference between the predictions of the\nsmall tuned and untuned LMs to shift the original predictions of the base model\nin the direction of tuning, while retaining the benefits of larger scale\npretraining. In experiments, when we apply proxy-tuning to Llama2-70B using\nproxies of only 7B size, we can close 88% of the gap between Llama2-70B and its\ntruly-tuned chat version, when evaluated across knowledge, reasoning, and\nsafety benchmarks. Interestingly, when tested on TruthfulQA, proxy-tuned models\nare actually more truthful than directly tuned models, possibly because\ndecoding-time guidance better retains the model's factual knowledge. 
We then\ndemonstrate the generality of proxy-tuning by applying it for domain adaptation\non code, and task-specific finetuning on question-answering and math problems.\nOur work demonstrates the promise of using small tuned LMs to efficiently\ncustomize large, potentially proprietary LMs through decoding-time guidance.", "upvotes":4 }, "publishedAt":"2024-01-17T06:23:03.113Z", "title":"Tuning Language Models by Proxy", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/6FYDePLkNlKGpx8IsD-Ju.png", "numComments":0 }, { "paper":{ "id":"2401.05749", "authors":[ { "_id":"65a0a536b1f0788359a997c5", "name":"Brian Thompson", "hidden":false }, { "_id":"65a0a536b1f0788359a997c6", "name":"Mehak Preet Dhaliwal", "hidden":false }, { "_id":"65a0a536b1f0788359a997c7", "user":{ "avatarUrl":"/avatars/63e5101040020f6750c2738c360b5dcf.svg", "isPro":false, "fullname":"Peter Frisch", "user":"pengpengpete", "type":"user" }, "name":"Peter Frisch", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:17:24.437Z", "hidden":false }, { "_id":"65a0a536b1f0788359a997c8", "user":{ "avatarUrl":"/avatars/839ea6d233f92bd83fcf22493bbbc020.svg", "isPro":false, "fullname":"Tobias Domhan", "user":"tdomhan", "type":"user" }, "name":"Tobias Domhan", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:17:34.723Z", "hidden":false }, { "_id":"65a0a536b1f0788359a997c9", "user":{ "avatarUrl":"/avatars/679a31005c8e4ef561dfe85786ddafc0.svg", "isPro":false, "fullname":"Marcello Federico", "user":"marcfede", "type":"user" }, "name":"Marcello Federico", "status":"admin_assigned", "statusLastChangedAt":"2024-01-12T10:17:42.132Z", "hidden":false } ], "publishedAt":"2024-01-11T08:56:13.000Z", "title":"A Shocking Amount of the Web is Machine Translated: Insights from\n Multi-Way Parallelism", "summary":"We show that content on the web is often translated into many languages, and\nthe low quality of these multi-way translations indicates they were likely\ncreated using Machine Translation (MT). Multi-way parallel, machine generated\ncontent not only dominates the translations in lower resource languages; it\nalso constitutes a large fraction of the total web content in those languages.\nWe also find evidence of a selection bias in the type of content which is\ntranslated into many languages, consistent with low quality English content\nbeing translated en masse into many lower resource languages, via MT. 
Our work\nraises serious concerns about training models such as multilingual large\nlanguage models on both monolingual and bilingual data scraped from the web.", "upvotes":4 }, "publishedAt":"2024-01-12T02:34:30.518Z", "title":"A Shocking Amount of the Web is Machine Translated: Insights from Multi-Way Parallelism", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/Z51JXTqXsyptcAmT7ZDga.png", "numComments":0 }, { "paper":{ "id":"2401.07004", "authors":[ { "_id":"65a77c2921943858bde10da1", "user":{ "avatarUrl":"/avatars/475852ee335c81b85a9a63dd917082fc.svg", "isPro":false, "fullname":"Yikai Zhang", "user":"Arist12", "type":"user" }, "name":"Yikai Zhang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:44:57.954Z", "hidden":false }, { "_id":"65a77c2921943858bde10da2", "user":{ "avatarUrl":"/avatars/f8d7fd57905d99b3fe56e950febba9b6.svg", "isPro":false, "fullname":"Junlong Li", "user":"lockon", "type":"user" }, "name":"Junlong Li", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:46:21.036Z", "hidden":false }, { "_id":"65a77c2921943858bde10da3", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1661715958139-6144a0c4ff1146bbd84d9865.png?w=200&h=200&f=face", "isPro":true, "fullname":"Pengfei Liu", "user":"Pengfei", "type":"user" }, "name":"Pengfei Liu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:46:04.774Z", "hidden":false } ], "publishedAt":"2024-01-13T07:57:01.000Z", "title":"Extending LLMs' Context Window with 100 Samples", "summary":"Large Language Models (LLMs) are known to have limited extrapolation ability\nbeyond their pre-trained context window, constraining their application in\ndownstream tasks with lengthy inputs. Recent studies have sought to extend\nLLMs' context window by modifying rotary position embedding (RoPE), a popular\nposition encoding method adopted by well-known LLMs such as LLaMA, PaLM, and\nGPT-NeoX. However, prior works like Position Interpolation (PI) and YaRN are\nresource-intensive and lack comparative experiments to assess their\napplicability. In this work, we identify the inherent need for LLMs' attention\nentropy (i.e. the information entropy of attention scores) to maintain\nstability and introduce a novel extension to RoPE which combines adjusting\nRoPE's base frequency and scaling the attention logits to help LLMs efficiently\nadapt to a larger context window. We validate the superiority of our method in\nboth fine-tuning performance and robustness across different context window\nsizes on various context-demanding tasks. Notably, our method extends the\ncontext window of LLaMA-2-7B-Chat to 16,384 with only 100 samples and 6\ntraining steps, showcasing extraordinary efficiency. Finally, we also explore\nhow data compositions and training curricula affect context window extension\nfor specific downstream tasks, suggesting fine-tuning LLMs with lengthy\nconversations as a good starting point. 
We release our code and SFT data at\nhttps://github.com/GAIR-NLP/Entropy-ABF.", "upvotes":3 }, "publishedAt":"2024-01-17T07:05:15.258Z", "title":"Extending LLMs' Context Window with 100 Samples", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/Qkvd3tER1WtHEyZa69or6.png", "numComments":0 }, { "paper":{ "id":"2401.07049", "authors":[ { "_id":"65a76cce0637ea5cccef525d", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/noauth/hNCtOOd5Sh_r-HaWg-GaL.png?w=200&h=200&f=face", "isPro":false, "fullname":"Michael Kölle", "user":"michaelkoelle", "type":"user" }, "name":"Michael Kölle", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:46:40.646Z", "hidden":false }, { "_id":"65a76cce0637ea5cccef525e", "name":"Gerhard Stenzel", "hidden":false }, { "_id":"65a76cce0637ea5cccef525f", "name":"Jonas Stein", "hidden":false }, { "_id":"65a76cce0637ea5cccef5260", "name":"Sebastian Zielinski", "hidden":false }, { "_id":"65a76cce0637ea5cccef5261", "name":"Björn Ommer", "hidden":false }, { "_id":"65a76cce0637ea5cccef5262", "name":"Claudia Linnhoff-Popien", "hidden":false } ], "publishedAt":"2024-01-13T11:38:08.000Z", "title":"Quantum Denoising Diffusion Models", "summary":"In recent years, machine learning models like DALL-E, Craiyon, and Stable\nDiffusion have gained significant attention for their ability to generate\nhigh-resolution images from concise descriptions. Concurrently, quantum\ncomputing is showing promising advances, especially with quantum machine\nlearning which capitalizes on quantum mechanics to meet the increasing\ncomputational requirements of traditional machine learning algorithms. This\npaper explores the integration of quantum machine learning and variational\nquantum circuits to augment the efficacy of diffusion-based image generation\nmodels. Specifically, we address two challenges of classical diffusion models:\ntheir low sampling speed and the extensive parameter requirements. We introduce\ntwo quantum diffusion models and benchmark their capabilities against their\nclassical counterparts using MNIST digits, Fashion MNIST, and CIFAR-10. Our\nmodels surpass the classical models with similar parameter counts in terms of\nperformance metrics FID, SSIM, and PSNR. 
Moreover, we introduce a consistency\nmodel unitary single sampling architecture that combines the diffusion\nprocedure into a single step, enabling a fast one-step image generation.", "upvotes":3 }, "publishedAt":"2024-01-17T05:59:47.036Z", "title":"Quantum Denoising Diffusion Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/gNu8XvUl1TTl3e4AzQgua.png", "numComments":0 }, { "paper":{ "id":"2401.04283", "authors":[ { "_id":"659e1c93da61c174cc1c2d1a", "name":"Yang Liu", "hidden":false }, { "_id":"659e1c93da61c174cc1c2d1b", "name":"Li Wan", "hidden":false }, { "_id":"659e1c93da61c174cc1c2d1c", "name":"Yun Li", "hidden":false }, { "_id":"659e1c93da61c174cc1c2d1d", "name":"Yiteng Huang", "hidden":false }, { "_id":"659e1c93da61c174cc1c2d1e", "name":"Ming Sun", "hidden":false }, { "_id":"659e1c93da61c174cc1c2d1f", "name":"James Luan", "hidden":false }, { "_id":"659e1c93da61c174cc1c2d20", "name":"Yangyang Shi", "hidden":false }, { "_id":"659e1c93da61c174cc1c2d21", "name":"Xin Lei", "hidden":false } ], "publishedAt":"2024-01-08T23:38:04.000Z", "title":"FADI-AEC: Fast Score Based Diffusion Model Guided by Far-end Signal for\n Acoustic Echo Cancellation", "summary":"Despite the potential of diffusion models in speech enhancement, their\ndeployment in Acoustic Echo Cancellation (AEC) has been restricted. In this\npaper, we propose DI-AEC, pioneering a diffusion-based stochastic regeneration\napproach dedicated to AEC. Further, we propose FADI-AEC, fast score-based\ndiffusion AEC framework to save computational demands, making it favorable for\nedge devices. It stands out by running the score model once per frame,\nachieving a significant surge in processing efficiency. Apart from that, we\nintroduce a novel noise generation technique where far-end signals are\nutilized, incorporating both far-end and near-end signals to refine the score\nmodel's accuracy. 
We test our proposed method on the ICASSP2023 Microsoft deep\necho cancellation challenge evaluation dataset, where our method outperforms\nsome of the end-to-end methods and other diffusion based echo cancellation\nmethods.", "upvotes":3 }, "publishedAt":"2024-01-10T04:26:59.915Z", "title":"FADI-AEC: Fast Score Based Diffusion Model Guided by Far-end Signal for Acoustic Echo Cancellation", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/-e0NpWsZ-BImClJDV1J5w.png", "numComments":0 }, { "paper":{ "id":"2401.06951", "authors":[ { "_id":"65a77b83e94d40886a883bd3", "name":"Jiaheng Liu", "hidden":false }, { "_id":"65a77b83e94d40886a883bd4", "name":"Zhiqi Bai", "hidden":false }, { "_id":"65a77b83e94d40886a883bd5", "name":"Yuanxing Zhang", "hidden":false }, { "_id":"65a77b83e94d40886a883bd6", "name":"Chenchen Zhang", "hidden":false }, { "_id":"65a77b83e94d40886a883bd7", "name":"Yu Zhang", "hidden":false }, { "_id":"65a77b83e94d40886a883bd8", "user":{ "avatarUrl":"/avatars/97a57859d7d87a3a8f1bb41d32a72bc2.svg", "isPro":false, "fullname":"Ge Zhang", "user":"zhangysk", "type":"user" }, "name":"Ge Zhang", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:57:41.725Z", "hidden":false }, { "_id":"65a77b83e94d40886a883bd9", "name":"Jiakai Wang", "hidden":false }, { "_id":"65a77b83e94d40886a883bda", "name":"Haoran Que", "hidden":false }, { "_id":"65a77b83e94d40886a883bdb", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1653710384819-62919485a29097b211bc7b83.png?w=200&h=200&f=face", "isPro":false, "fullname":"YukangChen", "user":"Yukang", "type":"user" }, "name":"Yukang Chen", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:58:28.966Z", "hidden":false }, { "_id":"65a77b83e94d40886a883bdc", "name":"Wenbo Su", "hidden":false }, { "_id":"65a77b83e94d40886a883bdd", "name":"Tiezheng Ge", "hidden":false }, { "_id":"65a77b83e94d40886a883bde", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/641a6895fb5ffff5ac79d593/dFR_ofjbqCrcqGa9R3MMq.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Jie Fu", "user":"bigaidream", "type":"user" }, "name":"Jie Fu", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:59:26.227Z", "hidden":false }, { "_id":"65a77b83e94d40886a883bdf", "user":{ "avatarUrl":"https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/1662232951344-6313a86154e6e5d9f0f94e04.jpeg?w=200&h=200&f=face", "isPro":false, "fullname":"Wenhu Chen", "user":"wenhu", "type":"user" }, "name":"Wenhu Chen", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T09:59:32.941Z", "hidden":false }, { "_id":"65a77b83e94d40886a883be0", "name":"Bo Zheng", "hidden":false } ], "publishedAt":"2024-01-13T02:11:20.000Z", "title":"E^2-LLM: Efficient and Extreme Length Extension of Large Language Models", "summary":"Typically, training LLMs with long context sizes is computationally\nexpensive, requiring extensive training hours and GPU resources. Existing\nlong-context extension methods usually need additional training procedures to\nsupport corresponding long-context windows, where the long-context training\ndata (e.g., 32k) is needed, and high GPU training costs are assumed. 
To address\nthe aforementioned issues, we propose an Efficient and Extreme length extension\nmethod for Large Language Models, called E 2 -LLM, with only one training\nprocedure and dramatically reduced computation cost, which also removes the\nneed to collect long-context data. Concretely, first, the training data of our\nE 2 -LLM only requires a short length (e.g., 4k), which reduces the tuning cost\ngreatly. Second, the training procedure on the short training context window is\nperformed only once time, and we can support different evaluation context\nwindows at inference. Third, in E 2 - LLM, based on RoPE position embeddings,\nwe introduce two different augmentation methods on the scale and position index\nparameters for different samples in training. It aims to make the model more\nrobust to the different relative differences when directly interpolating the\narbitrary context length at inference. Comprehensive experimental results on\nmultiple benchmark datasets demonstrate the effectiveness of our E 2 -LLM on\nchallenging long-context tasks.", "upvotes":2 }, "publishedAt":"2024-01-17T07:02:28.203Z", "title":"E^2-LLM: Efficient and Extreme Length Extension of Large Language Models", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/r3URA2dtZIqbthOp1EP9Z.png", "numComments":0 }, { "paper":{ "id":"2401.07727", "authors":[ { "_id":"65a76e840f169670d78e92a8", "name":"Antoine Mercier", "hidden":false }, { "_id":"65a76e840f169670d78e92a9", "user":{ "avatarUrl":"/avatars/63520d7e81b08dc0ca468f73047117d6.svg", "isPro":false, "fullname":"Ramin Nakhli", "user":"raminnakhli", "type":"user" }, "name":"Ramin Nakhli", "status":"admin_assigned", "statusLastChangedAt":"2024-01-17T10:09:53.945Z", "hidden":false }, { "_id":"65a76e840f169670d78e92aa", "name":"Mahesh Reddy", "hidden":false }, { "_id":"65a76e840f169670d78e92ab", "name":"Rajeev Yasarla", "hidden":false }, { "_id":"65a76e840f169670d78e92ac", "name":"Hong Cai", "hidden":false }, { "_id":"65a76e840f169670d78e92ad", "name":"Fatih Porikli", "hidden":false }, { "_id":"65a76e840f169670d78e92ae", "name":"Guillaume Berger", "hidden":false } ], "publishedAt":"2024-01-15T14:41:15.000Z", "title":"HexaGen3D: StableDiffusion is just one step away from Fast and Diverse\n Text-to-3D Generation", "summary":"Despite the latest remarkable advances in generative modeling, efficient\ngeneration of high-quality 3D assets from textual prompts remains a difficult\ntask. A key challenge lies in data scarcity: the most extensive 3D datasets\nencompass merely millions of assets, while their 2D counterparts contain\nbillions of text-image pairs. To address this, we propose a novel approach\nwhich harnesses the power of large, pretrained 2D diffusion models. More\nspecifically, our approach, HexaGen3D, fine-tunes a pretrained text-to-image\nmodel to jointly predict 6 orthographic projections and the corresponding\nlatent triplane. 
We then decode these latents to generate a textured mesh.\nHexaGen3D does not require per-sample optimization, and can infer high-quality\nand diverse objects from textual prompts in 7 seconds, offering significantly\nbetter quality-to-latency trade-offs when comparing to existing approaches.\nFurthermore, HexaGen3D demonstrates strong generalization to new objects or\ncompositions.", "upvotes":0 }, "publishedAt":"2024-01-17T06:07:03.595Z", "title":"HexaGen3D: StableDiffusion is just one step away from Fast and Diverse Text-to-3D Generation", "mediaUrl":"https://cdn-uploads.huggingface.co/production/uploads/60f1abe7544c2adfd699860c/zZaNBA_hs0EQK9LwuIBic.png", "numComments":0 } ]
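// NOTE: the helpers below are an illustrative sketch, not part of the recorded
// API response above. They assume only fields visible in the sample
// (`paper.id`, `paper.title`, `paper.upvotes`, `numComments`) and reuse the
// `DailyPapersApiResponse` type imported at the top of this file. The names
// `topPapers` and `paperUrl` are hypothetical, shown only to demonstrate how
// the fixture might be consumed in a test or quick script.

/** Return the `limit` most-upvoted entries from a daily-papers response. */
export function topPapers(
  papers: DailyPapersApiResponse,
  limit = 5,
): DailyPapersApiResponse {
  // Copy before sorting so the shared sample fixture is never mutated.
  return [...papers]
    .sort((a, b) => b.paper.upvotes - a.paper.upvotes)
    .slice(0, limit);
}

/** Build the Hugging Face papers URL for an arXiv-style paper id. */
export function paperUrl(id: string): string {
  return `https://huggingface.co/papers/${id}`;
}

// Example usage against the sample above:
//   for (const entry of topPapers(dailyPapersSample, 3)) {
//     console.log(
//       `${entry.paper.title}: ${entry.paper.upvotes} upvotes, ` +
//       `${entry.numComments} comments, ${paperUrl(entry.paper.id)}`,
//     );
//   }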