<!DOCTYPE html>
<html lang="en"><head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta charset="utf-8">
<!--[if IE]><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><![endif]-->
<title>Yoni Kasten's Homepage</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="A PhD student at the Weizmann institute of science, supervised by Prof. Ronen Basri." />
<meta name="author" content="Yoni Kasten" />
<!-- favicons -->
<!-- <link rel="shortcut icon" href="images/templatemo_favicon.ico"> -->
<!-- bootstrap core CSS -->
<link href="js/bootstrap.css" rel="stylesheet">
<!-- fancybox CSS -->
<link href="js/jquery.css" rel="stylesheet">
<!-- flex slider CSS -->
<link href="js/flexslider.css" rel="stylesheet">
<!-- custom styles for this template -->
<link href="js/templatemo_style.css" rel="stylesheet">
<!-- HTML5 shim and Respond.js IE8 support of HTML5 elements and media queries -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
<script src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script>
<![endif]-->
<style>
#cf3 {
text-align:left;
position:relative;
width:240px;
margin:0 auto;
}
#cf3 img {
position:absolute;
left:0;
-webkit-transition: opacity 0.1s ease-in-out;
-moz-transition: opacity 0.1s ease-in-out;
-o-transition: opacity 0.1s ease-in-out;
transition: opacity 0.1s ease-in-out;
}
@keyframes cf3FadeInOut {
0% {
opacity:1;
}
45% {
opacity:1;
}
55% {
opacity:0;
}
100% {
opacity:0;
}
}
#cf3 img.top {
animation-name: cf3FadeInOut;
animation-timing-function: ease-in-out;
animation-iteration-count: infinite;
animation-duration: 1s;
animation-direction: alternate;
}
</style>
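<!--
Usage note: the #cf3 rules above implement a simple two-image crossfade. They assume
a #cf3 container holding two stacked images, where the image with class "top" fades
in and out via the cf3FadeInOut keyframes. No #cf3 element appears in the markup
below, so the following is only a hypothetical sketch and the image paths are
placeholders:

<div id="cf3">
  <img src="images/photo_a.jpg" alt="photo A">
  <img class="top" src="images/photo_b.jpg" alt="photo B">
</div>
-->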
</head>
<body>
<header>
<div class="container">
<div class="row">
<div class="col-md-3 hidden-xs"></div>
<div class="col-xs-3 col-xs-offset-20 visible-xs">
<a href="#" id="mobile_menu"><span class="glyphicon glyphicon-align-justify"></span></a>
</div>
<div class="col-xs-24 visible-xs" id="mobile_menu_list">
<ul style="display: none;">
<li><a href="#templatemo_about" class="current">About</a></li>
<!-- <li><a href="#templatemo_slideshow">Slideshow</a></li> -->
<li><a href="#templatemo_publications">Publications</a></li>
</ul>
</div>
<div class="col-md-16 col-sm-18 hidden-xs" id="templatemo-nav-bar">
<ul class="nav navbar-right">
<li><a href="#templatemo_about" class="current">About</a></li>
<!-- <li><a href="#templatemo_slideshow">Slideshow</a></li> -->
<li><a href="#templatemo_publications">Publications</a></li>
</ul>
</div>
</div>
</div>
</header><!-- end of templatemo_header -->
<section id="templatemo_about">
<div class="container">
<div class="row">
<div class="col-md-2"></div>
<div id="my_photo" class="col-md-5 col-sm-7 col-xs-24">
<img src="images/profilenew.jpg" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Yoni Kasten</h2>
<p>
I am a Senior Research Scientist at NVIDIA Research. I'm interested in Computer Vision and Machine Learning. My research is mostly in the domain of 3D computer vision (e.g., Camera Localization, Structure from Motion, and 3D reconstruction) and has recently focused on deep neural models for computer vision problems that involve geometry. I recently completed my PhD in the Department of Computer Science and Applied Mathematics at the Weizmann Institute of Science,
under the supervision of <a href="http://www.weizmann.ac.il/math/ronen/home/" target="_blank">Prof. Ronen Basri</a>.
I did my B.Sc. in Electrical Engineering at the Hebrew University of Jerusalem, where I also did my M.Sc. in Computer Science under the supervision of <a href="https://www.cs.huji.ac.il/~peleg/" target="_blank">Prof. Shmuel Peleg</a> and <a href="https://www.cse.huji.ac.il/~werman/" target="_blank">Prof. Michael Werman</a>.
<br>
<b>Email:</b> yonikasten <font color="grey">at</font> gmail <font color="grey">dot</font> com<br>
</p>
<h4>Teaching</h4>
<ul>
<li>2021/spring (WIS): Multiple View Geometry for Computer Vision Applications (lecturer) </li>
<li>2020/spring (WIS): Multiple View Geometry for Computer Vision Applications (lecturer) </li>
<li>2018/winter (WIS): Introduction to Computer Vision (TA) </li>
<li>2016/winter (HUJI): Image Processing (TA) </li>
<li>2015/winter (HUJI): Image Processing (TA) </li>
</ul>
</div>
</div><!-- end of row -->
</div>
</section><!-- end of templatemo_about -->
<section id="templatemo_publications">
<div class="container">
<hr>
<div class="row">
<h1>Publications</h1>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/tracks_to_4d.gif" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Fast Encoder-Based 3D from Casual Videos via Point Track Processing</h2>
<p>
<b> Yoni Kasten </b> , Wuyue Lu, Haggai Maron <br>
<i> NeurIPS 2024 </i>
</p>
<a class="btn btn-default abstract" ptitle="This paper addresses the long-standing challenge of reconstructing 3D structures from videos with dynamic content. Current approaches to this problem were not designed to operate on casual videos recorded by standard cameras or require a long optimization time. Aiming to significantly improve the efficiency of previous approaches, we present TracksTo4D, a learning-based approach that enables inferring 3D structure and camera positions from dynamic content originating from casual videos using a single efficient feed-forward pass. To achieve this, we propose operating directly over 2D point tracks as input and designing an architecture tailored for processing 2D point tracks. Our proposed architecture is designed with two key principles in mind: (1) it takes into account the inherent symmetries present in the input point tracks data, and (2) it assumes that the movement patterns can be effectively represented using a low-rank approximation. TracksTo4D is trained in an unsupervised way on a dataset of casual videos utilizing only the 2D point tracks extracted from the videos, without any 3D supervision. Our experiments show that TracksTo4D can reconstruct a temporal point cloud and camera positions of the underlying video with accuracy comparable to state-of-the-art methods, while drastically reducing runtime by up to 95\%. We further show that TracksTo4D generalizes well to unseen videos of unseen semantic categories at inference time. "> Abstract</a>
<a href="https://arxiv.org/pdf/2404.07097" class="btn btn-default" target="_blank">Paper </a>
<a href="https://tracks-to-4d.github.io/" class="btn btn-default" target="_blank">Project page </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/diffusion_motion_transfer.gif" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Space-Time Diffusion Features for Zero-Shot Text-Driven Motion Transfer</h2>
<p>
Danah Yatim*, Rafail Fridman*, Omer Bar-Tal, <b> Yoni Kasten </b> , Tali Dekel (*equal contribution)<br>
<i> CVPR 2024 </i>
</p>
<a class="btn btn-default abstract" ptitle="We present a new method for text-driven motion transfer - synthesizing a video that complies with an input text prompt describing the target objects and scene while maintaining an input video's motion and scene layout. Prior methods are confined to transferring motion across two subjects within the same or closely related object categories and are applicable for limited domains (e.g., humans). In this work, we consider a significantly more challenging setting in which the target and source objects differ drastically in shape and fine-grained motion characteristics (e.g., translating a jumping dog into a dolphin). To this end, we leverage a pre-trained and fixed text-to-video diffusion model, which provides us with generative and motion priors. The pillar of our method is a new space-time feature loss derived directly from the model. This loss guides the generation process to preserve the overall motion of the input video while complying with the target object in terms of shape and fine-grained motion traits. "> Abstract</a>
<a href="https://diffusion-motion-transfer.github.io/MotionEditing_arXiv.pdf" class="btn btn-default" target="_blank">Paper </a>
<a href="https://diffusion-motion-transfer.github.io/" class="btn btn-default" target="_blank">Project page </a>
<a href="https://github.com/diffusion-motion-transfer/diffusion-motion-transfer" class="btn btn-default" target="_blank">Code </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/consistory.jpg" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Training-Free Consistent Text-to-Image Generation</h2>
<p>
Yoad Tewel, Omri Kaduri, Rinon Gal, <b> Yoni Kasten </b> , Lior Wolf, Gal Chechik, Yuval Atzmon<br>
<i> SIGGRAPH 2024 </i>
</p>
<a class="btn btn-default abstract" ptitle="Text-to-image models offer a new level of creative flexibility by allowing users to guide the image generation process through natural language. However, using these models to consistently portray the same subject across diverse prompts remains challenging. Existing approaches fine-tune the model to teach it new words that describe specific user-provided subjects or add image conditioning to the model. These methods require lengthy per-subject optimization or large-scale pre-training. Moreover, they struggle to align generated images with text prompts and face difficulties in portraying multiple subjects. Here, we present ConsiStory, a training-free approach that enables consistent subject generation by sharing the internal activations of the pretrained model. We introduce a subject-driven shared attention block and correspondence-based feature injection to promote subject consistency between images. Additionally, we develop strategies to encourage layout diversity while maintaining subject consistency. We compare ConsiStory to a range of baselines, and demonstrate state-of-the-art performance on subject consistency and text alignment, without requiring a single optimization step. Finally, ConsiStory can naturally extend to multi-subject scenarios, and even enable training-free personalization for common objects. "> Abstract</a>
<a href="https://arxiv.org/pdf/2402.03286" class="btn btn-default" target="_blank">Paper </a>
<a href="https://research.nvidia.com/labs/par/consistory/" class="btn btn-default" target="_blank">Project page </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/sds_complete.gif" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Point-Cloud Completion with Pretrained Text-to-image Diffusion Models</h2>
<p>
<b> Yoni Kasten </b> , Ohad Rahamim, Gal Chechik<br>
<i> NeurIPS 2023 </i>
</p>
<a class="btn btn-default abstract" ptitle="Point-cloud data collected in real-world applications are often incomplete, because objects are being observed from specific viewpoints, which only capture one perspective. Data can also be incomplete due to occlusion and low-resolution sampling. Existing approaches to completion rely on training models with datasets of predefined objects to guide the completion of point clouds. Unfortunately, these approaches fail to generalize when tested on objects or real-world setups that are poorly represented in their training set. Here, we leverage recent advances in text-guided 3D shape generation, showing how to use image priors for generating 3D objects. We describe an approach called SDS-Complete that uses a pre-trained text-to-image diffusion model and leverages the text semantics of a given incomplete point cloud of an object, to obtain a complete surface representation. SDS-Complete can complete a variety of objects using test-time optimization without expensive collection of 3D data. We evaluate SDS-Complete on a collection of incomplete scanned objects, captured by real-world depth sensors and LiDAR scanners. We find that it effectively reconstructs objects that are absent from common datasets, reducing Chamfer loss by about 50% on average compared with current methods. "> Abstract</a>
<a href="https://proceedings.neurips.cc/paper_files/paper/2023/file/284afdc2309f9667d2d4fb9290235b0c-Paper-Conference.pdf" class="btn btn-default" target="_blank">Paper </a>
<a href="https://sds-complete.github.io/" class="btn btn-default" target="_blank">Project page </a>
<a href="https://github.com/NVlabs/sds-complete" class="btn btn-default" target="_blank">Code </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/scene_scape.gif" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>SceneScape: Text-Driven Consistent Scene Generation</h2>
<p>
Rafail Fridman*, Amit Abecasis*, <b> Yoni Kasten </b> , Tali Dekel (*equal contribution)<br>
<i> NeurIPS 2023 </i>
</p>
<a class="btn btn-default abstract" ptitle="We present a method for text-driven perpetual view generation -- synthesizing long-term videos of various scenes solely from an input text prompt describing the scene and camera poses. We introduce a novel framework that generates such videos in an online fashion by combining the generative power of a pre-trained text-to-image model with the geometric priors learned by a pre-trained monocular depth prediction model. To tackle the pivotal challenge of achieving 3D consistency, i.e., synthesizing videos that depict geometrically-plausible scenes, we deploy an online test-time training to encourage the predicted depth map of the current frame to be geometrically consistent with the synthesized scene. The depth maps are used to construct a unified mesh representation of the scene, which is progressively constructed along the video generation process. In contrast to previous works, which are applicable only to limited domains, our method generates diverse scenes, such as walkthroughs in spaceships, caves, or ice castles. "> Abstract</a>
<a href="https://arxiv.org/pdf/2302.01133" class="btn btn-default" target="_blank">Paper </a>
<a href="https://scenescape.github.io/" class="btn btn-default" target="_blank">Project page </a>
<a href="https://github.com/RafailFridman/SceneScape" class="btn btn-default" target="_blank">Code </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/nfl.gif" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Neural LiDAR Fields for Novel View Synthesis</h2>
<p>
Shengyu Huang, Zan Gojcic, Zian Wang, Francis Williams, <b> Yoni Kasten </b> , Sanja Fidler, Konrad Schindler, Or Litany <br>
<i> ICCV 2023 </i>
</p>
<a class="btn btn-default abstract" ptitle="We present Neural Fields for LiDAR (NFL), a method to optimise a neural field scene representation from LiDAR measurements, with the goal of synthesizing realistic LiDAR scans from novel viewpoints. NFL combines the rendering power of neural fields with a detailed, physically motivated model of the LiDAR sensing process, thus enabling it to accurately reproduce key sensor behaviors like beam divergence, secondary returns, and ray dropping. We evaluate NFL on synthetic and real LiDAR scans and show that it outperforms explicit reconstruct-then-simulate methods as well as other NeRF-style methods on LiDAR novel view synthesis task. Moreover, we show that the improved realism of the synthesized views narrows the domain gap to real scans and translates to better registration and semantic segmentation performance. "> Abstract</a>
<a href="https://nv-tlabs.github.io/nfl/assets/nfl_main.pdf" class="btn btn-default" target="_blank">Paper </a>
<a href="https://research.nvidia.com/labs/toronto-ai/nfl/" class="btn btn-default" target="_blank">Project page </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/calm.gif" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>CALM: Conditional Adversarial Latent Models for Directable Virtual Characters</h2>
<p>
Chen Tessler, <b> Yoni Kasten </b> , Yunrong Guo, Shie Mannor, Gal Chechik, Xue Bin Peng <br>
<i> SIGGRAPH 2023 </i>
</p>
<a class="btn btn-default abstract" ptitle="In this work, we present Conditional Adversarial Latent Models (CALM), an approach for generating diverse and directable behaviors for user-controlled interactive virtual characters. Using imitation learning, CALM learns a representation of movement that captures the complexity and diversity of human motion, and enables direct control over character movements. The approach jointly learns a control policy and a motion encoder that reconstructs key characteristics of a given motion without merely replicating it. The results show that CALM learns a semantic motion representation, enabling control over the generated motions and style-conditioning for higher-level task training. Once trained, the character can be controlled using intuitive interfaces, akin to those found in video games. "> Abstract</a>
<a href="https://research.nvidia.com/labs/par/calm/assets/SIGGRAPH2023_CALM.pdf" class="btn btn-default" target="_blank">Paper </a>
<a href="https://research.nvidia.com/labs/par/calm/" class="btn btn-default" target="_blank">Project page </a>
<a href="https://github.com/NVlabs/CALM" class="btn btn-default" target="_blank">Code </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/mug_set_propagated_x4_grid.png" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Neural Congealing: Aligning Images to a Joint Semantic Atlas</h2>
<p>
Dolev Ofri-Amar, Michal Geyer, <b> Yoni Kasten </b> , Tali Dekel <br>
<i> CVPR 2023 </i>
</p>
<a class="btn btn-default abstract" ptitle="We present Neural Congealing -- a zero-shot self-supervised framework for detecting and jointly aligning semantically-common content across a given set of images. Our approach harnesses the power of pre-trained DINO-ViT features to learn: (i) a joint semantic atlas -- a 2D grid that captures the mode of DINO-ViT features in the input set, and (ii) dense mappings from the unified atlas to each of the input images. We derive a new robust self-supervised framework that optimizes the atlas representation and mappings per image set, requiring only a few real-world images as input without any additional input information (e.g., segmentation masks). Notably, we design our losses and training paradigm to account only for the shared content under severe variations in appearance, pose, background clutter or other distracting objects. We demonstrate results on a plethora of challenging image sets including sets of mixed domains (e.g., aligning images depicting sculpture and artwork of cats), sets depicting related yet different object categories (e.g., dogs and tigers), or domains for which large-scale training data is scarce (e.g., coffee mugs). We thoroughly evaluate our method and show that our test-time optimization approach performs favorably compared to a state-of-the-art method that requires extensive training on large-scale datasets. "> Abstract</a>
<a href="https://arxiv.org/pdf/2302.03956.pdf" class="btn btn-default" target="_blank">Paper </a>
<a href="https://neural-congealing.github.io/" class="btn btn-default" target="_blank">Project page </a>
<a href="https://github.com/dolev104/neural_congealing" class="btn btn-default" target="_blank">Code </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/semi_transparent_effects.png" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Text2LIVE: Text-Driven Layered Image and Video Editing</h2>
<p>
Omer Bar-Tal*, Dolev Ofri-Amar*, Rafail Fridman*, <b> Yoni Kasten </b> , Tali Dekel (*equal contribution)<br>
<i> ECCV 2022 </i>
</p>
<p style="color:red;"> Oral presentation </p>
<a class="btn btn-default abstract" ptitle="We present a method for zero-shot, text-driven appearance manipulation in natural images and videos. Specifically, given an input image or video and a target text prompt, our goal is to edit the appearance of existing objects (e.g., object's texture) or augment the scene with new visual effects (e.g., smoke, fire) in a semantically meaningful manner. Our framework trains a generator using an internal dataset of training examples, extracted from a single input (image or video and target text prompt), while leveraging an external pre-trained CLIP model to establish our losses. Rather than directly generating the edited output, our key idea is to generate an edit layer (color+opacity) that is composited over the original input. This allows us to constrain the generation process and maintain high fidelity to the original input via novel text-driven losses that are applied directly to the edit layer. Our method neither relies on a pre-trained generator nor requires user-provided edit masks. Thus, it can perform localized, semantic edits on high-resolution natural images and videos across a variety of objects and scenes. "> Abstract</a>
<a href="https://arxiv.org/pdf/2204.02491.pdf" class="btn btn-default" target="_blank">Paper </a>
<a href="https://text2live.github.io/" class="btn btn-default" target="_blank">Project page </a>
<a href="https://github.com/omerbt/Text2LIVE" class="btn btn-default" target="_blank">Code </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/teaser_sil.png" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Learning to Estimate Multi-view Pose from Object Silhouettes</h2>
<p>
<b> Yoni Kasten </b> , True Price, David Geraghty, Jan-Michael Frahm <br>
<i> Recovering 6D Object Pose Workshop at ECCV 2022 </i>
</p>
<a class="btn btn-default abstract" ptitle="While Structure-from-Motion pipelines certainly have their success cases in the task of 3D object reconstruction from multiple images, they still fail on many common objects that lack distinctive texture or have complex appearance qualities. The central problem lies in 6DOF camera pose estimation for the source images: without the ability to obtain a good estimate of the epipolar geometries, all state-of-the-art methods will fail. Although alternative solutions exist for specific objects, general solutions have proved elusive. In this work, we revisit the notion that silhouette cues can provide reasonable constraints on multi-view pose configurations when texture and priors are unavailable. Specifically, we train a neural network to holistically predict camera poses and pose confidences for a given set of input silhouette images, with the hypothesis that the network will be able to learn cues for multi-view relationships in a data-driven way. We show that our network generalizes to unseen synthetic and real object instances under reasonable assumptions about the input pose distribution of the images, and that the estimates are suitable to initialize state-of-the-art 3D reconstruction methods."> Abstract</a>
<a href="https://link.springer.com/chapter/10.1007/978-3-031-25085-9_8" class="btn btn-default" target="_blank">Paper </a>
<a href="https://vimeo.com/showcase/9946695/video/768455972" class="btn btn-default" target="_blank">Presentation (by True Price) </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/teaser_lucia.gif" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Layered Neural Atlases for Consistent Video Editing</h2>
<p>
<b> Yoni Kasten </b> , Dolev Ofri, Oliver Wang, Tali Dekel<br>
<i> SIGGRAPH Asia 2021 </i>
</p>
<a class="btn btn-default abstract" ptitle="We present a method that decomposes, and “unwraps”, an input video into a set of layered 2D atlases, each providing a unified representation of the appearance of an object (or background) over the video. For each pixel in the video, our method estimates its corresponding 2D coordinate in each of the atlases, giving us a consistent parameterization of the video, along with an associated alpha (opacity) value. Importantly, we design our atlases to be interpretable and semantic, which facilitates easy and intuitive editing in the atlas domain, with minimal manual work required. Edits applied to a single 2D atlas (or input video frame) are automatically and consistently mapped back to the original video frames, while preserving occlusions, deformation, and other complex scene effects such as shadows and reflections. Our method employs a coordinate-based Multilayer Perceptron (MLP) representation for mappings, atlases, and alphas, which are jointly optimized on a per-video basis, using a combination of video reconstruction and regularization losses. By operating purely in 2D, our method does not require any prior 3D knowledge about scene geometry or camera poses, and can handle complex dynamic real world videos. We demonstrate various video editing applications, including texture mapping, video style transfer, image-to-video texture transfer, and segmentation/labeling propagation, all automatically produced by editing a single 2D atlas image. "> Abstract</a>
<a href="https://arxiv.org/pdf/2109.11418.pdf" class="btn btn-default" target="_blank">Paper </a>
<a href="https://layered-neural-atlases.github.io/" class="btn btn-default" target="_blank">Project page </a>
<a href="https://github.com/ykasten/layered-neural-atlases" class="btn btn-default" target="_blank">Code </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/volsdf.gif" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Volume Rendering of Neural Implicit Surfaces</h2>
<p>
Lior Yariv, Jiatao Gu, <b> Yoni Kasten </b>, Yaron Lipman<br>
<i> NeurIPS 2021 </i>
</p>
<p style="color:red;"> Oral presentation </p>
<a class="btn btn-default abstract" ptitle="Neural volume rendering became increasingly popular recently due to its success in synthesizing novel views of a scene from a sparse set of input images. So far, the geometry learned by neural volume rendering techniques was modeled using a generic density function. Furthermore, the geometry itself was extracted using an arbitrary level set of the density function leading to a noisy, often low fidelity reconstruction. The goal of this paper is to improve geometry representation and reconstruction in neural volume rendering. We achieve that by modeling the volume density as a function of the geometry. This is in contrast to previous work modeling the geometry as a function of the volume density. In more detail, we define the volume density function as Laplace’s cumulative distribution function (CDF) applied to a signed distance function (SDF) representation. This simple density representation has three benefits: (i) it provides a useful inductive bias to the geometry learned in the neural volume rendering process; (ii) it facilitates a bound on the opacity approximation error, leading to an accurate sampling of the viewing ray. Accurate sampling is important to provide a precise coupling of geometry and radiance; and (iii) it allows efficient unsupervised disentanglement of shape and appearance in volume rendering. Applying this new density representation to challenging scene multiview datasets produced high quality geometry reconstructions, outperforming relevant baselines. Furthermore, switching shape and appearance between scenes is possible due to the disentanglement of the two."> Abstract</a>
<a href="https://arxiv.org/pdf/2106.12052.pdf" class="btn btn-default" target="_blank">Paper </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/iccv2021.png" alt=""><br>
<img src="images/four_optimization_results.gif" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Deep Permutation Equivariant Structure from Motion</h2>
<p>
Dror Moran*, Hodaya Koslowsky*, <b> Yoni Kasten </b> , Haggai Maron, Meirav Galun, Ronen Basri (*equal contribution)<br>
<i> ICCV 2021 </i>
</p>
<p style="color:red;"> Oral presentation</p>
<a class="btn btn-default abstract" ptitle="Existing deep methods produce highly accurate 3D reconstructions in stereo and multiview stereo settings, i.e., when cameras are both internally and externally calibrated. Nevertheless, the challenge of simultaneous recovery of camera poses and 3D scene structure in multiview settings with deep networks is still outstanding. Inspired by projective factorization for Structure from Motion (SFM) and by deep matrix completion techniques, we propose a neural network architecture that, given a set of point tracks in multiple images of a static scene, recovers both the camera parameters and a (sparse) scene structure by minimizing an unsupervised reprojection loss. Our network architecture is designed to respect the structure of the problem: the sought output is equivariant to permutations of both cameras and scene points. Notably, our method does not require initialization of camera parameters or 3D point locations. We test our architecture in two setups: (1) single scene reconstruction and (2) learning from multiple scenes. Our experiments, conducted on a variety of datasets in both internally calibrated and uncalibrated settings, indicate that our method accurately recovers pose and structure, on par with classical state of the art methods. Additionally, we show that a pre-trained network can be used to reconstruct novel scenes using inexpensive fine-tuning with no loss of accuracy."> Abstract</a>
<a href="https://arxiv.org/pdf/2104.06703.pdf" class="btn btn-default" target="_blank">Paper </a><a href="https://github.com/drormoran/Equivariant-SFM" class="btn btn-default" target="_blank">Code</a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/hybrid.png" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>A hybrid global structure from motion method for synchronously
estimating global rotations and global translations</h2>
<p>Xin Wang, Teng Xiao, <b> Yoni Kasten </b></p>
<tab1> </tab1>
<p><i>ISPRS Journal of Photogrammetry and Remote Sensing 2021</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="A hybrid global structure from motion method for synchronously estimating global rotations and global translations"
abstract="Over the last few decades, the methods of global image orientation, which is also called global SfM, have attracted a lot of attention from researchers, mainly thanks to its advantage of time efficiency. Based on the input of relative orientation results, most conventional global SfM methods employ a two-step strategy consisting of global rotation estimation and global translation estimation. This paper, on the contrary, introduces a hybrid global approach that intends to solve global rotations and translations synchronously, but hierarchically. To improve the robustness and time efficiency, we first propose a novel efficient method that is much faster than the previous approaches for extracting an optimal minimum cover of a connected image triplet set (OMCTS). The OMCTS makes all the available images contained in a minimum number of connected image triplets, as well as all of those selected triplets, satisfy the constraint that the three corresponding relative orientations are as compatible as possible to each other. In order to solve non-collinear triplets in the OMCTS, some fundamental characterizations of essential matrices in the multiple-image setting are used, and image pose parameters are then estimated via averaging the constrained essential matrices. For the collinear triplets, the above approach is invalid and the image pose parameters are then alternatively determined from the relative orientations using the depth of tie points from each individual local spatial intersection. Finally, all image orientations are moved to a common coordinate system by traversing the solved connected triplets using similarity transformations. Compared to the state-of-the-art global SfM methods, the performance and capability of the proposed hybrid approach are thoroughly demonstrated on various public datasets (mainly including ordered and unordered internet images, oblique aerial images, hard and complex datasets, etc.). ">Abstract</a>
<a href="https://www.researchgate.net/profile/Xin-Wang-305/publication/349394725_A_hybrid_global_structure_from_motion_method_for_synchronously_estimating_global_rotations_and_global_translations/links/602e23ba4585158939b080c9/A-hybrid-global-structure-from-motion-method-for-synchronously-estimating-global-rotations-and-global-translations.pdf" target="_blank" class="btn btn-default abstract" ptitle="A hybrid global structure from motion method for synchronously estimating global rotations and global translations">Paper</a>
<!--<a href="https://github.com/YuvalBahat/Confidence_From_Invariance" class="btn btn-default">Code</a>-->
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/skull.gif" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Multiview Neural Surface Reconstruction by Disentangling Geometry and Appearance</h2>
<p>Lior Yariv, <b> Yoni Kasten </b> , Dror Moran, Meirav Galun, Matan Atzmon, Ronen Basri, Yaron Lipman</p>
<tab1> </tab1>
<p><i>NeurIPS 2020</i></p>
<p style="color:red;"> Spotlight presentation</p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="Multiview Neural Surface Reconstruction by Disentangling Geometry and Appearance"
abstract="In this work we address the challenging problem of multiview 3D surface reconstruction. We introduce a neural network architecture that simultaneously learns the unknown geometry, camera parameters, and a neural renderer that approximates the light reflected from the surface towards the camera. The geometry is represented as a zero level-set of a neural network, while the neural renderer, derived from the rendering equation, is capable of (implicitly) modeling a wide set of lighting conditions and materials. We trained our network on real world 2D images of objects with different material properties, lighting conditions, and noisy camera initializations from the DTU MVS dataset. We found our model to produce state of the art 3D surface reconstructions with high fidelity, resolution and detail.">Abstract</a>
<a href="https://arxiv.org/pdf/2003.09852.pdf" target="_blank" class="btn btn-default abstract" ptitle="Multiview Neural Surface Reconstruction by Disentangling Geometry and Appearance">Paper</a>
<a href="https://lioryariv.github.io/idr/" target="_blank" class="btn btn-default abstract" ptitle="Multiview Neural Surface Reconstruction by Disentangling Geometry and Appearance">Project Page</a>
<a href="https://github.com/lioryariv/idr" target="_blank" class="btn btn-default abstract" ptitle="Multiview Neural Surface Reconstruction by Disentangling Geometry and Appearance">Code</a>
<!--<a href="https://github.com/YuvalBahat/Confidence_From_Invariance" class="btn btn-default">Code</a>-->
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/laplace.png" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>On the Similarity between the Laplace and Neural Tangent Kernels</h2>
<p>Amnon Geifman, Abhay Yadav, <b> Yoni Kasten </b> , Meirav Galun, David Jacobs, Ronen Basri</p>
<tab1> </tab1>
<p><i> NeurIPS 2020</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="On the Similarity between the Laplace and Neural Tangent Kernels"
abstract="Recent theoretical work has shown that massively overparameterized neural networks are equivalent to kernel regressors that use Neural Tangent Kernels(NTK). Experiments show that these kernel methods perform similarly to real neural networks. Here we show that NTK for fully connected networks is closely related to the standard Laplace kernel. We show theoretically that for normalized data on the hypersphere both kernels have the same eigenfunctions and their eigenvalues decay polynomially at the same rate, implying that their Reproducing Kernel Hilbert Spaces (RKHS) include the same sets of functions. This means that both kernels give rise to classes of functions with the same smoothness properties. The two kernels differ for data off the hypersphere, but experiments indicate that when data is properly normalized these differences are not significant. Finally, we provide experiments on real data comparing NTK and the Laplace kernel, along with a larger class of{\gamma}-exponential kernels. We show that these perform almost identically. Our results suggest that much insight about neural networks can be obtained from analysis of the well-known Laplace kernel, which has a simple closed-form.">Abstract</a>
<a href="https://arxiv.org/pdf/2007.01580.pdf" target="_blank" class="btn btn-default abstract" ptitle="On the Similarity between the Laplace and Neural Tangent Kernels">Paper</a>
<!--<a href="https://github.com/YuvalBahat/Confidence_From_Invariance" class="btn btn-default">Code</a>-->
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/miccaiw.png" alt="">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>End-To-End Convolutional Neural Network for 3D Reconstruction of Knee Bones From Bi-Planar X-Ray Images</h2>
<p>
<b> Yoni Kasten </b>*, Daniel Doktofsky*, Ilya Kovler* (*equal contribution)<br>
<i> International Workshop on Machine Learning for Medical Image Reconstruction at MICCAI 2020 </i> <br>
</p>
<a class="btn btn-default abstract" ptitle="We present an end-to-end Convolutional Neural Network (CNN) approach for 3D reconstruction of knee bones directly from two bi-planar X-ray images. Clinically, capturing the 3D models of the bones is crucial for surgical planning, implant fitting, and postoperative evaluation. X-ray imaging significantly reduces the exposure of patients to ionizing radiation compared to Computer Tomography (CT) imaging, and is much more common and inexpensive compared to Magnetic Resonance Imaging (MRI) scanners. However, retrieving 3D models from such 2D scans is extremely challenging. In contrast to the common approach of statistically modeling the shape of each bone, our deep network learns the distribution of the bones’ shapes directly from the training images. We train our model with both supervised and unsupervised losses using Digitally Reconstructed Radiograph (DRR) images generated from CT scans. To apply our model to X-Ray data, we use style transfer to transform between X-Ray and DRR modalities. As a result, at test time, without further optimization, our solution directly outputs a 3D reconstruction from a pair of bi-planar X-ray images, while preserving geometric constraints. Our results indicate that our deep learning model is very efficient, generalizes well and produces high quality reconstructions."> Abstract</a>
<a href="https://arxiv.org/pdf/2004.00871.pdf" class="btn btn-default" target="_blank">Paper </a>
</div>
</div><!-- end of row -->
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/icml2020.png" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Frequency Bias in Neural Networks for Input of Non-Uniform Density</h2>
<p>Ronen Basri, Meirav Galun, Amnon Geifman, David Jacobs, <b> Yoni Kasten </b> , Shira Kritchman (alphabetical order)</p>
<tab1> </tab1>
<p><i>ICML 2020</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="Frequency Bias in Neural Networks for Input of Non-Uniform Density"
abstract="Recent works have partly attributed the generalization ability of over-parameterized neural networks to frequency bias -- networks trained with gradient descent on data drawn from a uniform distribution find a low frequency fit before high frequency ones. As realistic training sets are not drawn from a uniform distribution, we here use the Neural Tangent Kernel (NTK) model to explore the effect of variable density on training dynamics. Our results, which combine analytic and empirical observations, show that when learning a pure harmonic function of frequency ?, convergence at a point $\x \in \Sphere^{d-1}$ occurs in time $O(\kappa^d/p(\x))$ where $p(\x)$ denotes the local density at $\x$. Specifically, for data in $\Sphere^1$ we analytically derive the eigenfunctions of the kernel associated with the NTK for two-layer networks. We further prove convergence results for deep, fully connected networks with respect to the spectral decomposition of the NTK. Our empirical study highlights similarities and differences between deep and shallow networks in this model.">Abstract</a>
<a href="https://arxiv.org/pdf/2003.04560.pdf" target="_blank" class="btn btn-default abstract" ptitle="Frequency Bias in Neural Networks for Input of Non-Uniform Density">Paper</a>
<!--<a href="https://github.com/YuvalBahat/Confidence_From_Invariance" class="btn btn-default">Code</a>-->
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/cvpr2020.png" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Averaging Essential and Fundamental Matrices in Collinear Camera Settings</h2>
<p>Amnon Geifman*, <b> Yoni Kasten</b>*, Meirav Galun and Ronen Basri (*equal contribution)</p>
<tab1> </tab1>
<p><i>CVPR 2020</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="Averaging Essential and Fundamental Matrices in Collinear Camera Settings"
abstract="Global methods to Structure from Motion have gained popularity in recent years. A significant drawback of global methods is their sensitivity to collinear camera settings. In this paper, we introduce an analysis and algorithms for averaging bifocal tensors (essential or fundamental matrices) when either subsets or all of the camera centers are collinear.
We provide a complete spectral characterization of bifocal tensors in collinear scenarios and further propose two averaging algorithms. The first algorithm uses rank constrained minimization to recover camera matrices in fully collinear settings. The second algorithm enriches the set of possibly mixed collinear and non-collinear cameras with additional, 'virtual cameras', which are placed in general position, enabling the application of existing averaging methods to the enriched set of bifocal tensors. Our algorithms are shown to achieve state of the art results on various benchmarks that include autonomous car datasets and unordered image collections in both calibrated and uncalibrated settings.">Abstract</a>
<a href="https://arxiv.org/pdf/1912.00254.pdf" target="_blank" class="btn btn-default abstract" ptitle="Averaging Essential and Fundamental Matrices in Collinear Camera Settings">Paper</a>
<a href="https://www.youtube.com/watch?v=P1stdNCttZY" class="btn btn-default" target="_blank">Video Lecture (Israel Computer Vision Day 2019) </a>
<!--<a href="https://github.com/YuvalBahat/Confidence_From_Invariance" class="btn btn-default">Code</a>-->
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/nips2019.PNG" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>The Convergence Rate of Neural Networks for</h2>
<h2>Learned Functions of Different Frequencies</h2>
<p>Ronen Basri, David Jacobs, <b> Yoni Kasten </b> and Shira Kritchman (alphabetical order)</p>
<tab1> </tab1>
<p><i>NeurIPS 2019</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="The Convergence Rate of Neural Networks for
Learned Functions of Different Frequencies"
abstract="We study the relationship between the speed at which a neural network learns a
function and the frequency of the function. We build on recent results that show
that the dynamics of overparameterized neural networks trained with gradient descent can be well approximated by a linear system. When normalized training
data is uniformly distributed on a hypersphere, the eigenfunctions of this linear
system are spherical harmonic functions. We derive the corresponding eigenvalues for each frequency after introducing a bias term in the model. This bias term
had been omitted from the linear network model without significantly affecting
previous theoretical results. However, we show theoretically and experimentally
that a shallow neural network without bias cannot learn simple, low frequency
functions with odd frequencies, in the limit of large amounts of data. Our results
enable us to make specific predictions of the time it will take a network with bias
to learn functions of varying frequency. These predictions match the behavior of
real shallow and deep networks.
">Abstract</a>
<a href="https://arxiv.org/pdf/1906.00425.pdf" target="_blank" class="btn btn-default abstract" ptitle="The Convergence Rate of Neural Networks for
Learned Functions of Different Frequencies">Paper</a>
<a href="https://github.com/ykasten/Convergence-Rate-NN-Different-Frequencies" class="btn btn-default" target="_blank">Code</a>
<!--<a href="https://github.com/YuvalBahat/Confidence_From_Invariance" class="btn btn-default">Code</a>-->
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/iccv2019.PNG" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Algebraic Characterization of Essential Matrices </h2>
<h2>and Their Averaging in Multiview Settings</h2>
<p><b> Yoni Kasten</b>*, Amnon Geifman*, Meirav Galun and Ronen Basri (*equal contribution)</p>
<tab1> </tab1>
<p><i>ICCV 2019</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="Algebraic Characterization of Essential Matrices and Their Averaging
in Multiview Settings"
abstract="Essential matrix averaging, i.e., the task of recovering camera locations and orientations in calibrated, multiview settings, is a first step in global approaches to Euclidean structure from motion. A common approach to essential matrix averaging is to separately solve for camera orientations and subsequently for camera positions. This paper presents a novel approach that solves simultaneously for both camera orientations and positions. We offer a complete characterization of the algebraic conditions that enable a unique Euclidean reconstruction of $n$ cameras from a collection of $(^n_2)$ essential matrices. We next use these conditions to formulate essential matrix averaging as a constrained optimization problem, allowing us to recover a consistent set of essential matrices given a (possibly partial) set of measured essential matrices computed independently for pairs of images. We finally use the recovered essential matrices to determine the global positions and orientations of the $n$ cameras. We test our method on common SfM datasets, demonstrating high accuracy while maintaining efficiency and robustness, compared to existing methods.
">Abstract</a>
<a href="https://arxiv.org/pdf/1904.02663.pdf" target="_blank" class="btn btn-default abstract" ptitle="Algebraic Characterization of Essential Matrices and Their Averaging
in Multiview Settings">Paper</a>
<a href="https://github.com/amnonge/Multi-view-Essential-Matrix" class="btn btn-default" target="_blank">Code</a>
<a href="https://www.youtube.com/watch?v=P1stdNCttZY" class="btn btn-default" target="_blank">Video Lecture (Israel Computer Vision Day 2019) </a>
<!--<a href="https://github.com/YuvalBahat/Confidence_From_Invariance" class="btn btn-default">Code</a>-->
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/cvpr2019.PNG" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>GPSfM: Global Projective SFM Using Algebraic Constraints</h2>
<h2>on Multi-View Fundamental Matrices</h2>
<p><b> Yoni Kasten</b>*, Amnon Geifman*, Meirav Galun and Ronen Basri (*equal contribution)</p>
<tab1> </tab1>
<p><i>CVPR 2019</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="GPSfM: Global Projective SFM Using Algebraic Constraints on Multi-View Fundamental Matrices"
abstract="This paper addresses the problem of recovering projective camera matrices from collections of fundamental matrices in multiview settings. We make two main contributions. First, given ${n \choose 2}$ fundamental matrices computed for $n$ images, we provide a complete algebraic characterization in the form of conditions that are both necessary and sufficient to enabling the recovery of camera matrices. These conditions are based on arranging the fundamental matrices as blocks in a single matrix, called the $n$-view fundamental matrix, and characterizing this matrix in terms of the signs of its eigenvalues and rank structures. Secondly, we propose a concrete algorithm for projective structure-from-motion that utilizes this characterization. Given a complete or partial collection of measured fundamental matrices, our method seeks camera matrices that minimize a global algebraic error for the measured fundamental matrices. In contrast to existing methods, our optimization, without any initialization, produces a consistent set of fundamental matrices that corresponds to a unique set of cameras (up to a choice of projective frame). Our experiments indicate that our method achieves state of the art performance in both accuracy and running time.
">Abstract</a>
<a href="https://arxiv.org/pdf/1812.00426.pdf" target="_blank" class="btn btn-default abstract" ptitle="GPSfM: Global Projective SFM Using Algebraic Constraints on Multi-View Fundamental Matrices">Paper</a>
<a href="https://github.com/amnonge/GPSFM-code" class="btn btn-default" target="_blank">Code</a>
<a href="https://www.youtube.com/watch?v=P1stdNCttZY" class="btn btn-default" target="_blank">Video Lecture (Israel Computer Vision Day 2019) </a>
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/wacv2019.PNG" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Resultant Based Incremental Recovery of Camera Pose </h2>
<h2>from Pairwise Matches</h2>
<p><b> Yoni Kasten </b> , Meirav Galun and Ronen Basri</p>
<tab1> </tab1>
<p><i>WACV 2019</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="Resultant Based Incremental Recovery of Camera Pose from Pairwise Matches"
abstract="Incremental (online) structure from motion pipelines seek to recover the camera matrix associated with an image I_n given n-1 images, I_1,...,I_n-1, whose camera matrices have already been recovered. In this paper, we introduce a novel solution to the six-point online algorithm to recover the exterior parameters associated with I_n. Our algorithm uses just six corresponding pairs of 2D points, extracted each from I_n and from any of the preceding n-1 images, allowing the recovery of the full six degrees of freedom of the n'th camera, and unlike common methods, does not require tracking feature points in three or more images. Our novel solution is based on constructing a Dixon resultant, yielding a solution method that is both efficient and accurate compared to existing solutions. We further use Bernstein's theorem to prove a tight bound on the number of complex solutions. Our experiments demonstrate the utility of our approach.
">Abstract</a>
<a href="https://arxiv.org/pdf/1901.09364.pdf" target="_blank" class="btn btn-default abstract" ptitle="Resultant Based Incremental Recovery of Camera Pose from Pairwise Matches">Paper</a>
<a href="https://github.com/ykasten/resultantCamPose" class="btn btn-default" target="_blank">Code</a>
<a href="https://www.youtube.com/watch?v=B_NzjQFZUN4" class="btn btn-default" target="_blank">Video Lecture (Israel Computer Vision Day 2018)</a>
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/icip2018.png" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Two View Constraints on the Epipoles from Few Correspondences </h2>
<p><b>Yoni Kasten</b>, Michael Werman</p>
<tab1> </tab1>
<p><i>ICIP 2018</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="Two View Constraints on the Epipoles from Few Correspondences"
abstract="In general it requires at least 7 point correspondences to compute the fundamental matrix between views. We use the cross ratio invariance between corresponding epipolar lines, stemming from epipolar line homography, to derive a simple formulation for the relationship between epipoles and corresponding points. We show how it can be used to reduce the number of required points for the epipolar geometry when some information about the epipoles is available and demonstrate this with a buddy search app.
">Abstract</a>
<a href="https://arxiv.org/pdf/1810.09496.pdf" target="_blank" class="btn btn-default abstract" ptitle="Two View Constraints on the Epipoles from Few Correspondences">Paper</a>
<a href="buddy_search.html" class="btn btn-default" target="_blank">Buddy Search Web App (developed with Tomer Hacohen)</a>
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/eccv2016.gif" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Fundamental Matrices from Moving Objects Using Line Motion Barcodes </h2>
<p><b>Yoni Kasten</b>, Gil Ben-Artzi, Shmuel Peleg and Michael Werman</p>
<tab1> </tab1>
<p><i>ECCV 2016</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="Fundamental Matrices from Moving Objects Using Line Motion Barcodes"
abstract="Computing the epipolar geometry between cameras with very different viewpoints is often very difficult. The appearance of objects can vary greatly, and it is difficult to find corresponding feature points. Prior methods searched for corresponding epipolar lines using points on the convex hull of the silhouette of a single moving object. These methods fail when the scene includes multiple moving objects. This paper extends previous work to scenes having multiple moving objects by using the ?otion Barcodes? a temporal signature of lines. Corresponding epipolar lines have similar motion barcodes, and candidate pairs of corresponding epipoar lines are found by the similarity of their motion barcodes. As in previous methods we assume that cameras are relatively stationary and that moving objects have already been extracted using background subtraction.
">Abstract</a>
<a href="https://arxiv.org/pdf/1607.07660.pdf" target="_blank" class="btn btn-default abstract" ptitle="Fundamental Matrices from Moving Objects Using Line Motion Barcodes">Paper</a>
</div>
</div>
<div class="row" id="templatemo_publications_LargeScaleBD">
<div class="col-md-1"></div>
<div class="col-md-5 col-sm-7 col-xs-24">
<img src="images/cvpr2016.png" alt="image 1">
</div>
<div class="col-md-1"></div>
<div class="col-md-16">
<h2>Camera Calibration From Dynamic Silhouettes Using Motion Barcodes </h2>
<p>Gil Ben-Artzi, <b>Yoni Kasten</b>, Shmuel Peleg and Michael Werman</p>
<tab1> </tab1>
<p><i>CVPR 2016</i></p>
<!--<a class="btn btn-default abstract" ptitle="Abstract will be available soon...">Abstract</a>-->
<a class="btn btn-default abstract" ptitle="Camera Calibration From Dynamic Silhouettes Using Motion Barcodes"
abstract="Computing the epipolar geometry between cameras with very different viewpoints is often problematic as matching points are hard to find. In these cases, it has been proposed to use information from dynamic objects in the scene for suggesting point and line correspondences. We propose a speed up of about two orders of magnitude, as well as an increase in robustness and accuracy, to methods computing epipolar geometry from dynamic silhouettes based on a new temporal signature, motion barcode for lines. This is a binary temporal sequence for lines, indicating for each frame the existence of at least one foreground pixel on that line. The motion barcodes of two corresponding epipolar lines are very similar so the search for corresponding epipolar lines can be limited to lines having similar barcodes leading to increased speed, accuracy, and robustness in computing the epipolar geometry.
">Abstract</a>
<a href="http://openaccess.thecvf.com/content_cvpr_2016/papers/Ben-Artzi_Camera_Calibration_From_CVPR_2016_paper.pdf" target="_blank" class="btn btn-default abstract" ptitle="Camera Calibration From Dynamic Silhouettes Using Motion Barcodes">Paper</a>
</div>
</div>
</div></section><!-- end of templatemo_publications -->
<br>
<br>
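<!-- Google Analytics (analytics.js) page-view tracking snippet -->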
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-76024733-2', 'auto');
ga('send', 'pageview');
</script>
<div id="lightbox" style="display:none;"><a href="#" class="lightbox-close lightbox-button"></a><div class="lightbox-nav" style="display: none;"><a href="#" class="lightbox-previous lightbox-button"></a><a href="#" class="lightbox-next lightbox-button"></a></div><div href="#" class="lightbox-caption"><p></p></div></div></body><!-- Mirrored from www.wisdom.weizmann.ac.il/~shaharko/ by HTTrack Website Copier/3.x [XR&CO'2014], Wed, 06 Apr 2016 11:56:56 GMT --></html><script id="f5_cspm">(function(){var f5_cspm={f5_p:'IOHAGAJCCJLFEHNFGLFAFJIDEHNLBIMHHIENOOIFILJIGLNIECMNPHFJBEPONDKELFKPFDBGPNABIAMBMBFBJOFECDNAOPGEAACPPLEJHOHNIKJKAABBIILKKENCEOAF',setCharAt:function(str,index,chr){if(index>str.length-1)return str;return str.substr(0,index)+chr+str.substr(index+1);},get_byte:function(str,i){var s=(i/16)|0;i=(i&15);s=s*32;return((str.charCodeAt(i+16+s)-65)<<4)|(str.charCodeAt(i+s)-65);},set_byte:function(str,i,b){var s=(i/16)|0;i=(i&15);s=s*32;str=f5_cspm.setCharAt(str,(i+16+s),String.fromCharCode((b>>4)+65));str=f5_cspm.setCharAt(str,(i+s),String.fromCharCode((b&15)+65));return str;},set_latency:function(str,latency){latency=latency&0xffff;str=f5_cspm.set_byte(str,48,(latency>>8));str=f5_cspm.set_byte(str,49,(latency&0xff));str=f5_cspm.set_byte(str,43,2);return str;},wait_perf_data:function(){try{var wp=window.performance.timing;if(wp.loadEventEnd>0){var res=wp.loadEventEnd-wp.navigationStart;if(res<60001){var cookie_val=f5_cspm.set_latency(f5_cspm.f5_p,res);window.document.cookie='f5avr0600570113aaaaaaaaaaaaaaaa='+encodeURIComponent(cookie_val)+';path=/';}
return;}}
catch(err){return;}
setTimeout(f5_cspm.wait_perf_data,100);return;},go:function(){var chunk=window.document.cookie.split(/\s*;\s*/);for(var i=0;i<chunk.length;++i){var pair=chunk[i].split(/\s*=\s*/);if(pair[0]=='f5_cspm'&&pair[1]=='1234')
{var d=new Date();d.setTime(d.getTime()-1000);window.document.cookie='f5_cspm=;expires='+d.toUTCString()+';path=/;';setTimeout(f5_cspm.wait_perf_data,100);}}}}
f5_cspm.go();}());</script>