% Chord lookup protocol (SIGCOMM '01). No crossref field: the entry is self-contained
% and there is no parent proceedings entry for it to inherit from.
@INPROCEEDINGS{Stoica01Chord_scalablepeer-to-peer,
  author    = {Ion Stoica and Robert Morris and David Karger and M. Frans Kaashoek and Hari Balakrishnan},
  title     = {Chord: a scalable peer-to-peer lookup service for internet applications},
  booktitle = {SIGCOMM '01: Proc. 2001 conference on Applications, Technologies, Architectures, and Protocols for Computer Communications},
  year      = {2001},
  pages     = {149--160},
  address   = {New York, NY, USA},
  publisher = {ACM},
  abstract  = {A fundamental problem that confronts peer-to-peer applications is to efficiently locate the node that stores a particular data item. This paper presents Chord, a distributed lookup protocol that addresses this problem. Chord provides support for just one operation: given a key, it maps the key onto a node. Data location can be easily implemented on top of Chord by associating a key with each data item, and storing the key/data item pair at the node to which the key maps. Chord adapts efficiently as nodes join and leave the system, and can answer queries even if the system is continuously changing. Results from theoretical analysis, simulations, and experiments show that Chord is scalable, with communication cost and the state maintained by each node scaling logarithmically with the number of Chord nodes.},
  doi       = {10.1145/383059.383071},
  isbn      = {1-58113-411-8},
  location  = {San Diego, California, United States},
  url       = {http://portal.acm.org/citation.cfm?id=383059.383071}
}

% pSearch: information retrieval over structured overlays (CCR 33(1)).
@ARTICLE{Tang03pSearch_informationretrieval,
  author    = {Chunqiang Tang and Zhichen Xu and Mallik Mahalingam},
  title     = {{pSearch:} Information retrieval in structured overlays},
  journal   = {ACM SIGCOMM Computer Communication Review},
  year      = {2003},
  volume    = {33},
  pages     = {89--94},
  number    = {1},
  abstract  = {We describe an efficient peer-to-peer information retrieval system, pSearch, that supports state-of-the-art content- and semantic-based full-text searches.
pSearch avoids the scalability problem of existing systems that employ centralized indexing, or index/query flooding. It also avoids the nondeterminism that is exhibited by heuristic-based approaches. In pSearch, documents in the network are organized around their vector representations (based on modern document ranking algorithms) such that the search space for a given query is organized around related documents, achieving both efficiency and accuracy.},
  address   = {New York, NY, USA},
  doi       = {10.1145/774763.774777},
  issn      = {0146-4833},
  publisher = {ACM},
  url       = {http://portal.acm.org/citation.cfm?id=774763.774777}
}

% CAN: content-addressable network (CCR 31(4), SIGCOMM '01 issue).
@ARTICLE{Ratnasamy01scalablecontent-addressablenetwork,
  author    = {Sylvia Ratnasamy and Paul Francis and Mark Handley and Richard Karp and Scott Shenker},
  title     = {A scalable content-addressable network},
  journal   = {ACM SIGCOMM Computer Communication Review},
  year      = {2001},
  volume    = {31},
  pages     = {161--172},
  number    = {4},
  abstract  = {Hash tables - which map ``keys'' onto ``values'' - are an essential building block in modern software systems. We believe a similar functionality would be equally valuable to large distributed systems. In this paper, we introduce the concept of a Content-Addressable Network (CAN) as a distributed infrastructure that provides hash table-like functionality on Internet-like scales. The CAN is scalable, fault-tolerant and completely self-organizing, and we demonstrate its scalability, robustness and low-latency properties through simulation.},
  address   = {New York, NY, USA},
  doi       = {10.1145/964723.383072},
  issn      = {0146-4833},
  publisher = {ACM},
  url       = {http://portal.acm.org/citation.cfm?id=964723.383072}
}

% CFS: cooperative read-only storage layered on Chord (OSR 35(5)).
@ARTICLE{Dabek01Wide-areacooperativestorage,
  author    = {Frank Dabek and M.
Frans Kaashoek and David Karger and Robert Morris and Ion Stoica},
  title     = {Wide-area cooperative storage with {CFS}},
  journal   = {ACM SIGOPS Operating Systems Review},
  year      = {2001},
  volume    = {35},
  pages     = {202--215},
  number    = {5},
  abstract  = {The Cooperative File System (CFS) is a new peer-to-peer read-only storage system that provides provable guarantees for the efficiency, robustness, and load-balance of file storage and retrieval. CFS does this with a completely decentralized architecture that can scale to large systems. CFS servers provide a distributed hash table (DHash) for block storage. CFS clients interpret DHash blocks as a file system. DHash distributes and caches blocks at a fine granularity to achieve load balance, uses replication for robustness, and decreases latency with server selection. DHash finds blocks using the Chord location protocol, which operates in time logarithmic in the number of servers. CFS is implemented using the SFS file system toolkit and runs on Linux, OpenBSD, and FreeBSD. Experience on a globally deployed prototype shows that CFS delivers data to clients as fast as FTP. Controlled tests show that CFS is scalable: with 4,096 servers, looking up a block of data involves contacting only seven servers. The tests also demonstrate nearly perfect robustness and unimpaired performance even when as many as half the servers fail.},
  address   = {New York, NY, USA},
  doi       = {10.1145/502059.502054},
  issn      = {0163-5980},
  publisher = {ACM},
  url       = {http://portal.acm.org/citation.cfm?id=502059.502054}
}

% The Google File System (OSR 37(5)). Journal name spelled as in the other
% SIGOPS entries of this file so the venue renders identically everywhere.
@ARTICLE{Ghemawat03Googlefilesystem,
  author    = {Sanjay Ghemawat and Howard Gobioff and Shun-Tak Leung},
  title     = {The {Google} file system},
  journal   = {ACM SIGOPS Operating Systems Review},
  year      = {2003},
  volume    = {37},
  pages     = {29--43},
  number    = {5},
  abstract  = {We have designed and implemented the Google File System, a scalable distributed file system for large distributed data-intensive applications.
It provides fault tolerance while running on inexpensive commodity hardware, and it delivers high aggregate performance to a large number of clients. While sharing many of the same goals as previous distributed file systems, our design has been driven by observations of our application workloads and technological environment, both current and anticipated, that reflect a marked departure from some earlier file system assumptions. This has led us to reexamine traditional choices and explore radically different design points. The file system has successfully met our storage needs. It is widely deployed within Google as the storage platform for the generation and processing of data used by our service as well as research and development efforts that require large data sets. The largest cluster to date provides hundreds of terabytes of storage across thousands of disks on over a thousand machines, and it is concurrently accessed by hundreds of clients. In this paper, we present file system interface extensions designed to support distributed applications, discuss many aspects of our design, and report measurements from both micro-benchmarks and real world use.},
  address   = {New York, NY, USA},
  doi       = {10.1145/1165389.945450},
  issn      = {0163-5980},
  publisher = {ACM},
  url       = {http://portal.acm.org/citation.cfm?doid=1165389.945450}
}

% OceanStore (CAN 28(5), ASPLOS-IX issue). NOTE(review): author list includes
% Westley Weimer, per the published paper -- confirm against the DOI landing page.
@ARTICLE{Kubiatowicz00OceanStore_architectureglobal-scale,
  author    = {John Kubiatowicz and David Bindel and Yan Chen and Steven Czerwinski and Patrick Eaton and Dennis Geels and Ramakrishna Gummadi and Sean Rhea and Hakim Weatherspoon and Westley Weimer and Chris Wells and Ben Zhao},
  title     = {{OceanStore:} an architecture for global-scale persistent storage},
  journal   = {ACM SIGARCH Computer Architecture News},
  year      = {2000},
  volume    = {28},
  pages     = {190--201},
  number    = {5},
  abstract  = {OceanStore is a utility infrastructure designed to span the globe and provide continuous access to persistent information.
Since this infrastructure is comprised of untrusted servers, data is protected through redundancy and cryptographic techniques. To improve performance, data is allowed to be cached anywhere, anytime. Additionally, monitoring of usage patterns allows adaptation to regional outages and denial of service attacks; monitoring also enhances performance through pro-active movement of data. A prototype implementation is currently under development.},
  address   = {New York, NY, USA},
  doi       = {10.1145/378995.379239},
  issn      = {0163-5964},
  publisher = {ACM},
  url       = {http://portal.acm.org/citation.cfm?id=379239}
}

% SFS: self-certifying pathnames (SOSP '99, published as issue 5 of OSR volume 33).
% Classic styles reject volume and number together for this entry type, so only
% the volume is kept, paired with the series it belongs to.
% The accent is written as a brace group so BibTeX treats it as one letter.
@INPROCEEDINGS{Mazi`eres99Separatingkeymanagement,
  author    = {David Mazi{\`e}res and Michael Kaminsky and M. Frans Kaashoek and Emmett Witchel},
  title     = {Separating key management from file system security},
  booktitle = {SOSP '99: Proc. 17th ACM symposium on Operating Systems Principles},
  year      = {1999},
  volume    = {33},
  series    = {ACM SIGOPS Operating Systems Review},
  pages     = {124--139},
  address   = {New York, NY, USA},
  publisher = {ACM},
  abstract  = {No secure network file system has ever grown to span the Internet. Existing systems all lack adequate key management for security at a global scale. Given the diversity of the Internet, any particular mechanism a file system employs to manage keys will fail to support many types of use. We propose separating key management from file system security, letting the world share a single global file system no matter how individuals manage keys. We present SFS, a secure file system that avoids internal key management. While other file systems need key management to map file names to encryption keys, SFS file names effectively contain public keys, making them self-certifying pathnames. Key management in SFS occurs outside of the file system, in whatever procedure users choose to generate file names. Self-certifying pathnames free SFS clients from any notion of administrative realm, making inter-realm file sharing trivial.
They let users authenticate servers through a number of different techniques. The file namespace doubles as a key certification namespace, so that people can realize many key management schemes using only standard file utilities. Finally, with self-certifying pathnames, people can bootstrap one key management mechanism using another. These properties make SFS more versatile than any file system with built-in key management.},
  doi       = {10.1145/319344.319160},
  isbn      = {1-58113-140-2},
  location  = {Charleston, South Carolina, United States},
  url       = {http://portal.acm.org/citation.cfm?id=319344.319160}
}

% Bayou anti-entropy protocol (OSR 31(5)). Abstract kept ASCII-only so the
% entry is safe for 8-bit BibTeX.
@ARTICLE{Petersen97Flexibleupdatepropagation,
  author    = {Karin Petersen and Mike J. Spreitzer and Douglas B. Terry and Marvin M. Theimer and Alan J. Demers},
  title     = {Flexible update propagation for weakly consistent replication},
  journal   = {ACM SIGOPS Operating Systems Review},
  year      = {1997},
  volume    = {31},
  pages     = {288--301},
  number    = {5},
  abstract  = {Bayou's anti-entropy protocol for update propagation between weakly consistent storage replicas is based on pair-wise communication, the propagation of write operations, and a set of ordering and closure constraints on the propagation of the writes. The simplicity of the design makes the protocol very flexible, thereby providing support for diverse networking environments and usage scenarios. It accommodates a variety of policies for when and where to propagate updates. It operates over diverse network topologies, including low-bandwidth links. It is incremental. It enables replica convergence, and updates can be propagated using floppy disks and similar transportable media. Moreover, the protocol handles replica creation and retirement in a light-weight manner. Each of these features is enabled by only one or two of the protocol's design choices, and can be independently incorporated in other systems.
This paper presents the anti-entropy protocol in detail, describing the design decisions and resulting features.},
  address   = {New York, NY, USA},
  doi       = {10.1145/269005.266711},
  issn      = {0163-5980},
  publisher = {ACM},
  url       = {http://portal.acm.org/citation.cfm?doid=269005.266711}
}

% PAST storage management and caching (OSR 35(5)).
@ARTICLE{Rowstron01Storagemanagementand,
  author    = {Antony Rowstron and Peter Druschel},
  title     = {Storage management and caching in {PAST}, a large-scale, persistent peer-to-peer storage utility},
  journal   = {ACM SIGOPS Operating Systems Review},
  year      = {2001},
  volume    = {35},
  pages     = {188--201},
  number    = {5},
  abstract  = {This paper presents and evaluates the storage management and caching in PAST, a large-scale peer-to-peer persistent storage utility. PAST is based on a self-organizing, Internet-based overlay network of storage nodes that cooperatively route file queries, store multiple replicas of files, and cache additional copies of popular files. In the PAST system, storage nodes and files are each assigned uniformly distributed identifiers, and replicas of a file are stored at nodes whose identifier matches most closely the file's identifier. This statistical assignment of files to storage nodes approximately balances the number of files stored on each node. However, non-uniform storage node capacities and file sizes require more explicit storage load balancing to permit graceful behavior under high global storage utilization; likewise, non-uniform popularity of files requires caching to minimize fetch distance and to balance the query load. We present and evaluate PAST, with an emphasis on its storage management and caching system.
Extensive trace-driven experiments show that the system minimizes fetch distance, that it balances the query load for popular files, and that it displays graceful degradation of performance as the global storage utilization increases beyond 95\%.},
  address   = {New York, NY, USA},
  doi       = {10.1145/502059.502053},
  issn      = {0163-5980},
  publisher = {ACM},
  url       = {http://portal.acm.org/citation.cfm?id=502053}
}

% Bayou conflict management (OSR 29(5)). The system name is brace-protected
% in the title so sentence-casing styles keep its capital.
@ARTICLE{Terry95Managingupdateconflicts,
  author    = {Douglas B. Terry and Marvin M. Theimer and Karin Petersen and Alan J. Demers and Mike J. Spreitzer and Carl H. Hauser},
  title     = {Managing update conflicts in {Bayou}, a weakly connected replicated storage system},
  journal   = {ACM SIGOPS Operating Systems Review},
  year      = {1995},
  volume    = {29},
  pages     = {172--182},
  number    = {5},
  abstract  = {Bayou is a replicated, weakly consistent storage system designed for a mobile computing environment that includes portable machines with less than ideal network connectivity. To maximize availability, users can read and write any accessible replica. Bayou's design has focused on supporting application-specific mechanisms to detect and resolve the update conflicts that naturally arise in such a system, ensuring that replicas move towards eventual consistency, and defining a protocol by which the resolution of update conflicts stabilizes. It includes novel methods for conflict detection, called dependency checks, and per-write conflict resolution based on client-provided merge procedures. To guarantee eventual consistency, Bayou servers must be able to rollback the effects of previously executed writes and redo them according to a global serialization order. Furthermore, Bayou permits clients to observe the results of all writes received by a server, including tentative writes whose conflicts have not been ultimately resolved.
This paper presents the motivation for and design of these mechanisms and describes the experiences gained with an initial implementation of the system.},
  address   = {New York, NY, USA},
  doi       = {10.1145/224057.224070},
  issn      = {0163-5980},
  publisher = {ACM},
  url       = {http://portal.acm.org/citation.cfm?doid=224057.224070}
}

% Deno object replication (IEEE Trans. Computers 52(7)). The cedilla is
% written as a brace group so BibTeX sorts and labels it as a single letter.
@ARTICLE{cCetintemel03Deno_DecentralizedPeer-to-Peer,
  author    = {Ugur {\c{C}}etintemel and Peter J. Keleher and Bobby Bhattacharjee and Michael J. Franklin},
  title     = {Deno: A decentralized, peer-to-peer object-replication system for weakly connected environments},
  journal   = {IEEE Transactions on Computers},
  year      = {2003},
  volume    = {52},
  pages     = {943--959},
  number    = {7},
  abstract  = {This paper presents the design, implementation, and evaluation of the replication framework of Deno, a decentralized, peer-to-peer object-replication system targeted for weakly connected environments. Deno uses weighted voting for availability and pair-wise, epidemic information flow for flexibility. This combination allows the protocols to operate with less than full connectivity, to easily adapt to changes in group membership, and to make few assumptions about the underlying network topology. We present two versions of Deno's protocol that differ in the consistency levels they support. We also propose security extensions to handle a class of malicious actions that involve misrepresentation of protocol information. Deno has been implemented and runs on top of Linux and Win32 platforms. We use the Deno prototype to characterize the performance of the Deno protocols and extensions.
Our study reveals several interesting results that provide fundamental insight into the benefits of decentralization and the mechanics of epidemic protocols.},
  address   = {Washington, DC, USA},
  doi       = {10.1109/TC.2003.1214342},
  issn      = {0018-9340},
  publisher = {IEEE Computer Society},
  url       = {http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1214342}
}

% Farsite (OSDI '02). Conference name capitalised like the other OSDI
% entries in this file.
@INPROCEEDINGS{Adya02Farsite_federatedavailable,
  author    = {Atul Adya and William J. Bolosky and Miguel Castro and Gerald Cermak and Ronnie Chaiken and John R. Douceur and Jon Howell and Jacob R. Lorch and Marvin Theimer and Roger P. Wattenhofer},
  title     = {Farsite: federated, available, and reliable storage for an incompletely trusted environment},
  booktitle = {OSDI '02: Proc. 5th symposium on Operating Systems Design and Implementation},
  year      = {2002},
  pages     = {1--14},
  address   = {New York, NY, USA},
  publisher = {ACM},
  abstract  = {Farsite is a secure, scalable file system that logically functions as a centralized file server but is physically distributed among a set of untrusted computers. Farsite provides file availability and reliability through randomized replicated storage; it ensures the secrecy of file contents with cryptographic techniques; it maintains the integrity of file and directory data with a Byzantine-fault-tolerant protocol; it is designed to be scalable by using a distributed hint mechanism and delegation certificates for pathname translations; and it achieves good performance by locally caching file data, lazily propagating file updates, and varying the duration and granularity of content leases. We report on the design of Farsite and the lessons we have learned by implementing much of that design.},
  doi       = {10.1145/1060289.1060291},
  location  = {Boston, Massachusetts},
  url       = {http://portal.acm.org/citation.cfm?id=1060291}
}

% Snapdragon: block-level security for network-attached disks (FAST '03).
@INPROCEEDINGS{Aguilera03Block-LevelSecurityNetwork-Attached,
  author    = {Marcos K.
Aguilera and Minwen Ji and Mark Lillibridge and John MacCormick and Erwin Oertli and Dave Andersen and Mike Burrows and Timothy Mann and Chandramohan A. Thekkath},
  title     = {Block-level security for network-attached disks},
  booktitle = {FAST '03: Proc. 2nd USENIX conference on File and Storage Technologies},
  year      = {2003},
  pages     = {159--174},
  address   = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract  = {We propose a practical and efficient method for adding security to network-attached disks (NADs). In contrast to previous work, our design requires no changes to the data layout on disk, minimal changes to existing NADs, and only small changes to the standard protocol for accessing remote block-based devices. Thus, existing NAD file systems and storage-management software could incorporate our scheme very easily. Our design enforces security using the well-known idea of self-describing capabilities, with two novel features that limit the need for memory on secure NADs: a scheme to manage revocations based on capability groups, and a replay-detection method using Bloom filters. We have implemented a prototype NAD file system, called Snapdragon, that incorporates our ideas. We evaluated Snapdragon's performance and scalability. The overhead of access control is small: latency for reads and writes increases by less than 0.5 ms (5\%), while bandwidth decreases by up to 16\%. The aggregate throughput scales linearly with the number of NADs (up to 7 in our experiments).},
  location  = {San Francisco, CA},
  url       = {http://www.usenix.org/publications/library/proceedings/fast03/tech/aguilera.html}
}

% Sinfonia (SOSP '07). First author spelled with a plain space, exactly as in
% the FAST '03 entry, so both entries carry the same name string.
@INPROCEEDINGS{Aguilera07Sinfonia_newparadigm,
  author    = {Marcos K. Aguilera and Arif Merchant and Mehul Shah and Alistair Veitch and Christos Karamanolis},
  title     = {Sinfonia: A new paradigm for building scalable distributed systems},
  booktitle = {SOSP '07: Proc.
21st ACM SIGOPS symposium on Operating Systems Principles},
  year      = {2007},
  pages     = {159--174},
  address   = {New York, NY, USA},
  publisher = {ACM},
  abstract  = {We propose a new paradigm for building scalable distributed systems. Our approach does not require dealing with message-passing protocols -- a major complication in existing distributed systems. Instead, developers just design and manipulate data structures within our service called Sinfonia. Sinfonia keeps data for applications on a set of memory nodes, each exporting a linear address space. At the core of Sinfonia is a novel minitransaction primitive that enables efficient and consistent access to data, while hiding the complexities that arise from concurrency and failures. Using Sinfonia, we implemented two very different and complex applications in a few months: a cluster file system and a group communication service. Our implementations perform well and scale to hundreds of machines.},
  doi       = {10.1145/1294261.1294278},
  isbn      = {978-1-59593-591-5},
  location  = {Stevenson, Washington, USA},
  url       = {http://portal.acm.org/citation.cfm?doid=1294261.1294278}
}

% PRACTI replication (NSDI '06).
@INPROCEEDINGS{Belaramani06PRACTIReplication,
  author    = {Nalini Moti Belaramani and Michael Dahlin and Lei Gao and Amol Nayate and Arun Venkataramani and Praveen Yalagandula and Jiandan Zheng},
  title     = {{PRACTI} replication},
  booktitle = {NSDI '06: Proc. 3rd USENIX symposium on Networked Systems Design \& Implementation},
  year      = {2006},
  pages     = {59--72},
  address   = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract  = {We present PRACTI, a new approach for large-scale replication. PRACTI systems can replicate or cache any subset of data on any node (Partial Replication), provide a broad range of consistency guarantees (Arbitrary Consistency), and permit any node to send information to any other node (Topology Independence). A PRACTI architecture yields two significant advantages.
First, by providing all three PRACTI properties, it enables better trade-offs than existing mechanisms that support at most two of the three desirable properties. The PRACTI approach thus exposes new points in the design space for replication systems. Second, the flexibility of PRACTI protocols simplifies the design of replication systems by allowing a single architecture to subsume a broad range of existing systems and to reduce development costs for new ones. To illustrate both advantages, we use our PRACTI prototype to emulate existing server replication, client-server, and object replication systems and to implement novel policies that improve performance for mobile users, web edge servers, and grid computing by as much as an order of magnitude.},
  location  = {San Jose, CA},
  url       = {http://www.usenix.org/events/nsdi06/tech/belaramani.html}
}

% Result caching for peer-to-peer search (IPTPS '03, LNCS 2735).
@INPROCEEDINGS{Bhattacharjee03Efficientpeer-to-peersearches,
  author    = {Bobby Bhattacharjee and Sudarshan Chawathe and Vijay Gopalakrishnan and Pete Keleher and Bujor Silaghi},
  title     = {Efficient peer-to-peer searches using result-caching},
  booktitle = {IPTPS '03: Proc. 2nd international workshop on Peer-To-Peer Systems},
  year      = {2003},
  volume    = {2735},
  series    = {Lecture Notes in Computer Science},
  pages     = {225--236},
  publisher = {Springer Berlin / Heidelberg},
  abstract  = {Existing peer-to-peer systems implement a single function well: data lookup. There is now a wealth of research describing how to reliably disseminate, and to later retrieve, data in a scalable and load-balanced manner.},
  doi       = {10.1007/b11823},
  url       = {http://www.springerlink.com/content/qd4tt3dp6fagvtxn}
}

% PBFT. The venue acronym is OSDI, matching the spelled-out conference name
% and the osdi99 URL of this entry.
@INPROCEEDINGS{Castro99PracticalByzantinefault,
  author    = {Miguel Castro and Barbara Liskov},
  title     = {Practical {Byzantine} fault tolerance},
  booktitle = {OSDI '99: Proc.
3rd symposium on Operating Systems Design and Implementation},
  year      = {1999},
  pages     = {173--186},
  address   = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract  = {This paper describes a new replication algorithm that is able to tolerate Byzantine faults. We believe that Byzantine-fault-tolerant algorithms will be increasingly important in the future because malicious attacks and software errors are increasingly common and can cause faulty nodes to exhibit arbitrary behavior. Whereas previous algorithms assumed a synchronous system or were too slow to be used in practice, the algorithm described in this paper is practical: it works in asynchronous environments like the Internet and incorporates several important optimizations that improve the response time of previous algorithms by more than an order of magnitude. We implemented a Byzantine-fault-tolerant NFS service using our algorithm and measured its performance. The results show that our service is only 3\% slower than a standard unreplicated NFS.},
  isbn      = {1-880446-39-1},
  location  = {New Orleans, Louisiana, United States},
  url       = {http://www.pmg.lcs.mit.edu/~castro/osdi99_html/osdi99.html}
}

% Bounded voting replication protocol (SRDS '00). First author spelled with
% the cedilla, as in the 2003 Deno entry by the same author.
@INPROCEEDINGS{Cetintemel00Performanceofmobile,
  author    = {Ugur {\c{C}}etintemel and Peter Keleher},
  title     = {Performance of mobile, single-object, replication protocols},
  booktitle = {SRDS '00: Proc. 19th IEEE symposium on Reliable Distributed Systems},
  year      = {2000},
  pages     = {218--227},
  abstract  = {Discusses the implementation and performance of bounded voting, which is a new object replication protocol designed for use in mobile and weakly-connected environments. We show that the protocol eliminates several restrictions of previous work, such as the need for (1) strong or complete connectivity, (2) complete knowledge of system membership, and (3) low update rates. The protocol implements an asynchronous, weighted-voting scheme via epidemic information flow, and commits updates in an entirely decentralized fashion.
A proxy mechanism is used to enable transparent handling of planned disconnections. We use a detailed simulation study to characterize the performance of bounded voting under a variety of loads and environments, and to compare it to another decentralized epidemic protocol. We further investigate the performance impact of the proxy mechanism.},
  doi       = {10.1109/RELDI.2000.885409},
  keywords  = {mobile computing, object-oriented databases, protocols, replicated databases, software performance evaluation, asynchronous weighted-voting scheme, bounded voting, connectivity, decentralized epidemic protocol, decentralized update commitment, epidemic information flow, loads, mobile single-object replication protocols, performance, planned disconnections, proxy mechanism, simulation, system membership, transparent handling, update rates, weakly-connected environments},
  url       = {http://ieeexplore.ieee.org/search/wrapper.jsp?arnumber=885409}
}

% Carbonite replica maintenance (NSDI '06). Publisher and address are those of
% the other NSDI '06 entry in this file, which is from the same proceedings.
@INPROCEEDINGS{Chun06EfficientReplicaMaintenance,
  author    = {Byung-Gon Chun and Frank Dabek and Andreas Haeberlen and Emil Sit and Hakim Weatherspoon and M. Frans Kaashoek and John Kubiatowicz and Robert Morris},
  title     = {Efficient replica maintenance for distributed storage systems},
  booktitle = {NSDI '06: Proc. 3rd USENIX symposium on Networked Systems Design \& Implementation},
  year      = {2006},
  address   = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract  = {This paper considers replication strategies for storage systems that aggregate the disks of many nodes spread over the Internet. Maintaining replication in such systems can be prohibitively expensive, since every transient network or host failure could potentially lead to copying a server's worth of data over the Internet to maintain replication levels. The following insights in designing an efficient replication algorithm emerge from the paper's analysis. First, durability can be provided separately from availability; the former is less expensive to ensure and a more useful goal for many wide-area applications.
Second, the focus of a durability algorithm must be to create new copies of data objects faster than permanent disk failures destroy the objects; careful choice of policies for what nodes should hold what data can decrease repair time. Third, increasing the number of replicas of each data object does not help a system tolerate a higher disk failure probability, but does help tolerate bursts of failures. Finally, ensuring that the system makes use of replicas that recover after temporary failure is critical to efficiency. Based on these insights, the paper proposes the Carbonite replication algorithm for keeping data durable at a low cost. A simulation of Carbonite storing 1 TB of data over a 365 day trace of PlanetLab activity shows that Carbonite is able to keep all data durable and uses 44\% more network traffic than a hypothetical system that only responds to permanent failures. In comparison, Total Recall and DHash require almost a factor of two more network traffic than this hypothetical system.},
  url       = {http://www.usenix.org/events/nsdi06/tech/chun.html}
}

% Freenet (LNCS volume 2009, published 2001). The volume field holds the
% series volume number only; the year lives in the year field.
@INPROCEEDINGS{Clarke01Freenet_distributedanonymous,
  author    = {Ian Clarke and Oskar Sandberg and Brandon Wiley and Theodore Hong},
  title     = {Freenet: A distributed anonymous information storage and retrieval system},
  booktitle = {Designing Privacy Enhancing Technologies: international workshop on Design Issues in Anonymity and Unobservability},
  year      = {2001},
  volume    = {2009},
  series    = {Lecture Notes in Computer Science},
  pages     = {46--66},
  publisher = {Springer Berlin / Heidelberg},
  abstract  = {We describe Freenet, an adaptive peer-to-peer network application that permits the publication, replication, and retrieval of data while protecting the anonymity of both authors and readers. Freenet operates as a network of identical nodes that collectively pool their storage space to store data files and cooperate to route requests to the most likely physical location of data.
No broadcast search or centralized location index is employed. Files are referred to in a location-independent manner, and are dynamically replicated in locations near requestors and deleted from locations where there is no interest. It is infeasible to discover the true origin or destination of a file passing through the network, and difficult for a node operator to determine or be held responsible for the actual physical contents of her own node.},
  doi       = {10.1007/3-540-44702-4_4},
  url       = {http://www.springerlink.com/content/tmu95yypt1rd9pct/}
}

% Samsara: fairness enforcement for peer-to-peer storage (SOSP '03).
@INPROCEEDINGS{Cox03Samsara_honoramong,
  author    = {Landon P. Cox and Brian D. Noble},
  title     = {Samsara: honor among thieves in peer-to-peer storage},
  booktitle = {SOSP '03: Proc. 19th ACM symposium on Operating Systems Principles},
  year      = {2003},
  pages     = {120--132},
  address   = {New York, NY, USA},
  publisher = {ACM},
  abstract  = {Peer-to-peer storage systems assume that their users consume resources in proportion to their contribution. Unfortunately, users are unlikely to do this without some enforcement mechanism. Prior solutions to this problem require centralized infrastructure, constraints on data placement, or ongoing administrative costs. All of these run counter to the design philosophy of peer-to-peer systems. Samsara enforces fairness in peer-to-peer storage systems without requiring trusted third parties, symmetric storage relationships, monetary payment, or certified identities. Each peer that requests storage of another must agree to hold a claim in return---a placeholder that accounts for available space. After an exchange, each partner checks the other to ensure faithfulness. Samsara punishes unresponsive nodes probabilistically. Because objects are replicated, nodes with transient failures are unlikely to suffer data loss, unlike those that are dishonest or chronically unavailable. Claim storage overhead can be reduced when necessary by forwarding among chains of nodes, and eliminated when cycles are created.
Forwarding chains increase the risk of exposure to failure, but such risk is modest under reasonable assumptions of utilization and simultaneous, persistent failure.},
  doi       = {10.1145/945445.945458},
  isbn      = {1-58113-757-5},
  location  = {Bolton Landing, NY, USA},
  url       = {http://portal.acm.org/citation.cfm?id=945458}
}

% MapReduce (OSDI '04). NOTE(review): pages are the printed proceedings range
% rather than a digital-library article number -- confirm against the USENIX page.
@INPROCEEDINGS{Dean04MapReduce_simplifieddata,
  author    = {Jeffrey Dean and Sanjay Ghemawat},
  title     = {{MapReduce:} Simplified data processing on large clusters},
  booktitle = {OSDI '04: Proc. 6th symposium on Operating Systems Design \& Implementation},
  year      = {2004},
  pages     = {137--150},
  address   = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract  = {MapReduce is a programming model and an associated implementation for processing and generating large data sets. Users specify a \emph{map} function that processes a key/value pair to generate a set of intermediate key/value pairs, and a \emph{reduce} function that merges all intermediate values associated with the same intermediate key. Many real world tasks are expressible in this model, as shown in the paper. Programs written in this functional style are automatically parallelized and executed on a large cluster of commodity machines. The run-time system takes care of the details of partitioning the input data, scheduling the program's execution across a set of machines, handling machine failures, and managing the required inter-machine communication. This allows programmers without any experience with parallel and distributed systems to easily utilize the resources of a large distributed system. Our implementation of MapReduce runs on a large cluster of commodity machines and is highly scalable: a typical MapReduce computation processes many terabytes of data on thousands of machines.
Programmers find the system easy to use: hundreds of MapReduce programs have been implemented and upwards of one thousand MapReduce jobs are executed on Google's clusters every day.},
  location = {San Francisco, CA},
  url = {http://www.usenix.org/events/osdi04/tech/dean.html}
}

@INPROCEEDINGS{DeCandia07Dynamo_Amazon'shighly,
  author = {Giuseppe DeCandia and Deniz Hastorun and Madan Jampani and Gunavardhan Kakulapati and Avinash Lakshman and Alex Pilchin and Swaminathan Sivasubramanian and Peter Vosshall and Werner Vogels},
  title = {Dynamo: {Amazon's} highly available key-value store},
  booktitle = {SOSP '07: Proc. 21st ACM SIGOPS symposium on Operating systems principles},
  year = {2007},
  pages = {205--220},
  address = {New York, NY, USA},
  publisher = {ACM},
  abstract = {Reliability at massive scale is one of the biggest challenges we face at Amazon.com, one of the largest e-commerce operations in the world; even the slightest outage has significant financial consequences and impacts customer trust. The Amazon.com platform, which provides services for many web sites worldwide, is implemented on top of an infrastructure of tens of thousands of servers and network components located in many datacenters around the world. At this scale, small and large components fail continuously and the way persistent state is managed in the face of these failures drives the reliability and scalability of the software systems. This paper presents the design and implementation of Dynamo, a highly available key-value storage system that some of Amazon's core services use to provide an "always-on" experience. To achieve this level of availability, Dynamo sacrifices consistency under certain failure scenarios. It makes extensive use of object versioning and application-assisted conflict resolution in a manner that provides a novel interface for developers to use.},
  doi = {10.1145/1294261.1294281},
  isbn = {978-1-59593-591-5},
  location = {Stevenson, Washington, USA},
  url = {http://portal.acm.org/citation.cfm?doid=1294261.1294281}
}

% The day range is concatenated onto the predefined month macro (mar) so the style decides how the month name is printed.
@INPROCEEDINGS{Ding04Peer-to-peerfile-sharingover,
  author = {G. Ding and Bharat Bhargava},
  title = {Peer-to-peer file-sharing over mobile ad hoc networks},
  booktitle = {Proc. 2nd IEEE annual conference on Pervasive Computing and Communications Workshops},
  year = {2004},
  pages = {104--108},
  month = "14--17~" # mar,
  abstract = {Current peer-to-peer file-sharing systems mostly work on wired networks. Mobile ad hoc network is characterized as multihop wireless communications between mobile devices. Five routing approaches with different complexity are proposed to enable peer-to-peer file-sharing over mobile ad hoc networks. The complexity of the proposed approaches is evaluated and compared. It is concluded that the cross-layer protocols perform better than simply overlaying peer-to-peer searching protocol on mobile ad hoc networks.},
  doi = {10.1109/PERCOMW.2004.1276914},
  url = {http://ieeexplore.ieee.org/search/wrapper.jsp?arnumber=1276914}
}

@INPROCEEDINGS{Freedman02IntroducingTarzanPeer-to-Peer,
  author = {Michael J. Freedman and Emil Sit and Josh Cates and Robert Morris},
  title = {Introducing {Tarzan}, a peer-to-peer anonymizing network layer},
  booktitle = {IPTPS '02: Proc. 1st international workshop on Peer-to-Peer Systems},
  year = {2002},
  abstract = {We introduce Tarzan, a peer-to-peer anonymous network layer that provides generic IP forwarding. Unlike prior anonymizing layers, Tarzan is flexible, transparent, decentralized, and highly scalable. Tarzan achieves these properties by building anonymous IP tunnels between an open-ended set of peers.
Tarzan can provide anonymity to existing applications, such as web browsing and file sharing, without change to those applications. Performance tests show that Tarzan imposes minimal overhead over a corresponding non-anonymous overlay route.},
  url = {http://www.cs.rice.edu/Conferences/IPTPS02/}
}

% {RE:} is brace-protected so sentence-casing styles do not lowercase the system name.
@INPROCEEDINGS{Garriss06RE_Reliableemail,
  author = {Scott Garriss and Michael Kaminsky and Michael J. Freedman and Brad Karp and David Mazi{\`e}res and Haifeng Yu},
  title = {{RE:} reliable email},
  booktitle = {NSDI '06: Proc. 3rd USENIX symposium on Networked Systems Design \& Implementation},
  year = {2006},
  pages = {297--310},
  address = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract = {The explosive growth in unwanted email has prompted the development of techniques for the rejection of email, intended to shield recipients from the onerous task of identifying the legitimate email in their inboxes amid a sea of spam. Unfortunately, widely used content-based filtering systems have converted the spam problem into a false positive one: email has become unreliable. Email acceptance techniques complement rejection ones; they can help prevent false positives by filing email into a user's inbox before it is considered for rejection. Whitelisting, whereby recipients accept email from some set of authorized senders, is one such acceptance technique. We present Reliable Email (RE:), a new whitelisting system that incurs zero false positives among socially connected users. Unlike previous whitelisting systems, which require that whitelists be populated manually, RE: exploits friend-of-friend relationships among email correspondents to populate whitelists automatically. To do so, RE: permits an email's recipient to discover whether other email users have whitelisted the email's sender, while preserving the privacy of users' email contacts with cryptographic private matching techniques. Using real email traces from two sites, we demonstrate that RE: renders a significant fraction of received email reliable. Our evaluation also shows that RE: can prevent up to 88\% of the false positives incurred by a widely deployed email rejection system, at modest computational cost.},
  location = {San Jose, CA},
  url = {http://www.usenix.org/events/nsdi06/tech/garriss.html}
}

% The day range is concatenated onto the predefined month macro (apr) so the style decides how the month name is printed.
@INPROCEEDINGS{Goldberg98TowardsarchivalIntermemory,
  author = {Andrew V. Goldberg and Peter N. Yianilos},
  title = {Towards an archival {Intermemory}},
  booktitle = {ADL '98: Proc. IEEE international forum on Research and Technology Advances in Digital Libraries},
  year = {1998},
  pages = {147--156},
  month = "22--24~" # apr,
  abstract = {We propose a self-organizing archival Intermemory. That is, a noncommercial subscriber-provided distributed information storage service built on the existing Internet. Given an assumption of continued growth in the memory's total size, a subscriber's participation for only a finite time can nevertheless ensure archival preservation of the subscriber's data. Information disperses through the network over time and memories become more difficult to erase as they age. The probability of losing an old memory given random node failures is vanishingly small---and an adversary would have to corrupt hundreds of thousands of nodes to destroy a very old memory. This paper presents a framework for the design of an Intermemory, and considers certain aspects of the design in greater detail. In particular, the aspects of addressing, space efficiency, and redundant coding are discussed.},
  doi = {10.1109/ADL.1998.670389},
  url = {http://ieeexplore.ieee.org/search/wrapper.jsp?arnumber=670389}
}

@INPROCEEDINGS{Gopalakrishnan04Adaptivereplicationin,
  author = {Vijay Gopalakrishnan and Bujor Silaghi and Bobby Bhattacharjee and Pete Keleher},
  title = {Adaptive replication in peer-to-peer systems},
  booktitle = {ICDCS '04: Proc.
24th international conference on Distributed Computing Systems},
  year = {2004},
  pages = {360--369},
  abstract = {Peer-to-peer systems can be used to form a low-latency decentralized data delivery system. Structured peer-to-peer systems provide both low latency and excellent load balance with uniform query and data distributions. Under the more common skewed access distributions, however, individual nodes are easily overloaded, resulting in poor global performance and lost messages. This paper describes a lightweight, adaptive, and system-neutral replication protocol, called LAR, that maintains low access latencies and good load balance even under highly skewed demand. We apply LAR to Chord and show that it has lower overhead and better performance than existing replication strategies.},
  doi = {10.1109/ICDCS.2004.1281601},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1281601}
}

@INPROCEEDINGS{Harjula04Plug-and-playapplicationplatform_,
  author = {Erkki Harjula and Mika Ylianttila and Jussi Ala-Kurikka and Jukka Riekki and Jaakko Sauvola},
  title = {Plug-and-play application platform: Towards mobile peer-to-peer},
  booktitle = {MUM '04: Proc. 3rd international conference on Mobile and Ubiquitous Multimedia},
  year = {2004},
  pages = {63--69},
  address = {New York, NY, USA},
  publisher = {ACM},
  abstract = {While peer-to-peer (P2P) has emerged as a new hot communication concept among the Internet users, mobile usage of P2P applications is still taking its first steps. This article first elaborates the evolutionary process that P2P architectures are going through. Challenges and requirements for mobile P2P are then identified, followed by a definition of a novel Plug-and-Play Application Platform (PnPAP). This platform enables dynamic selections between diverse P2P and session management protocols while preserving the best available network connectivity through Holistic Connectivity (HCon) management. On-the-fly reconfiguration and run-time parameter optimization can be done with a lightweight interpretable state machine. The concept enables flexible and seamless communications for mobile devices in P2P networks.},
  doi = {10.1145/1052380.1052390},
  isbn = {1-58113-981-0},
  location = {College Park, Maryland},
  url = {http://portal.acm.org/citation.cfm?doid=1052380.1052390}
}

@INPROCEEDINGS{Hazel02Achord_Variantof,
  author = {Steven Hazel and Brandon Wiley},
  title = {Achord: A variant of the {Chord} lookup service for use in censorship resistant peer-to-peer publishing systems},
  booktitle = {IPTPS '02: Proc. 1st international workshop on Peer-to-Peer Systems},
  year = {2002},
  abstract = {Any peer-to-peer publishing system must provide a mechanism for efficiently locating published documents. For censorship resistant systems, it is particularly important that the lookup mechanism be difficult to disable or abuse. Chord [1] is a promising distributed lookup mechanism, because analysis has provided certain useful guarantees about the speed and correctness of Chord's operation. We examine the suitability of Chord for building censorship resistant peer-to-peer publishing systems, and suggest Achord, a variant of the Chord mechanism which takes into account the additional requirements imposed by the goal of censorship resistance.},
  url = {http://www.thalassocracy.org/achord/achord-iptps.html}
}

% The accent is written as a brace group ({\`e}) so BibTeX treats it as one letter when sorting and building labels.
@INPROCEEDINGS{Kaminsky03Decentralizeduserauthentication,
  author = {Michael Kaminsky and George Savvides and David Mazi{\`e}res and M. Frans Kaashoek},
  title = {Decentralized user authentication in a global file system},
  booktitle = {SOSP '03: Proc.
19th ACM symposium on Operating Systems Principles},
  year = {2003},
  pages = {60--73},
  address = {New York, NY, USA},
  publisher = {ACM},
  abstract = {The challenge for user authentication in a global file system is allowing people to grant access to specific users and groups in remote administrative domains, without assuming any kind of pre-existing administrative relationship. The traditional approach to user authentication across administrative domains is for users to prove their identities through a chain of certificates. Certificates allow for general forms of delegation, but they often require more infrastructure than is necessary to support a network file system. This paper introduces an approach without certificates. Local authentication servers pre-fetch and cache remote user and group definitions from remote authentication servers. During a file access, an authentication server can establish identities for users based just on local information. This approach is particularly well-suited to file systems, and it provides a simple and intuitive interface that is similar to those found in local access control mechanisms. An implementation of the authentication server and a file server supporting access control lists demonstrate the viability of this design in the context of the Self-certifying File System (SFS). Experiments demonstrate that the authentication server can scale to groups with tens of thousands of members.},
  doi = {10.1145/945445.945452},
  isbn = {1-58113-757-5},
  location = {Bolton Landing, NY, USA},
  url = {http://portal.acm.org/citation.cfm?id=945445.945452}
}

% The ordinal in booktitle is numeric (18th) to match the other proceedings names in this file.
@INPROCEEDINGS{Keleher99Decentralizedreplicated-objectprotocols,
  author = {Peter J. Keleher},
  title = {Decentralized replicated-object protocols},
  booktitle = {PODC '99: Proc. 18th annual ACM symposium on Principles of distributed computing},
  year = {1999},
  pages = {143--151},
  address = {New York, NY, USA},
  publisher = {ACM},
  abstract = {We describe a new replicated-object protocol designed for use in mobile and weakly-connected environments. The protocol differs from previous protocols in combining epidemic information propagation with voting, and in using fixed per-object currencies for voting. The advantage of epidemic protocols is that data movement only requires pairwise communication. Hence, there is no need for a majority quorum to be available and simultaneously connected at any single time. The protocols increase availability by using voting, rather than primary copy or primary commit schemes. Finally, the use of per-object voting currencies allows votes to take place in an entirely decentralized fashion, without any server having complete knowledge of group membership. We show that currency allocation can be used to implement diverse policies. For example, uniform currency distributions emulate traditional dynamic voting schemes, while allocating all currency to a single server emulates a primary-copy scheme. We present simulation results showing both schemes, as well as the performance advantages of using currency proxies to temporarily reallocate currency during planned disconnections.},
  doi = {10.1145/301308.301345},
  isbn = {1-58113-099-6},
  location = {Atlanta, Georgia, United States},
  url = {http://portal.acm.org/citation.cfm?id=301308.301345}
}

@INPROCEEDINGS{Ledlie03Scoopedagain,
  author = {Jonathan Ledlie and Jeff Shneidman and Margo Seltzer and John Huth},
  title = {Scooped, again},
  booktitle = {IPTPS '03: Proc.
2nd international workshop on Peer-To-Peer Systems},
  year = {2003},
  volume = {2735},
  series = {Lecture Notes in Computer Science},
  pages = {129--138},
  publisher = {Springer Berlin / Heidelberg},
  abstract = {The Peer-to-Peer (p2p) and Grid infrastructure communities are tackling an overlapping set of problems. In addressing these problems, p2p solutions are usually motivated by elegance or research interest. Grid researchers, under pressure from thousands of scientists with real file sharing and computational needs, are pooling their solutions from a wide range of sources in an attempt to meet user demand. Driven by this need to solve large scientific problems quickly, the Grid is being constructed with the tools at hand: FTP or RPC for data transfer, centralization for scheduling and authentication, and an assumption of correct, obedient nodes. If history is any guide, the World Wide Web depicts viscerally that systems that address user needs can have enormous staying power and affect future research. The Grid infrastructure is a great customer waiting for future p2p products. By no means should we make them our only customers, but we should at least put them on the list. If p2p research does not at least address the Grid, it may eventually be sidelined by de facto distributed algorithms that are less elegant but were used to solve Grid problems. In essence, we'll have been scooped, again.},
  doi = {10.1007/b11823},
  url = {http://www.springerlink.com/content/vlja6rjgaq4vtavt}
}
% NOTE(review): doi 10.1007/b11823 above is also used by the other IPTPS '03 entry in this file, so it is
% presumably the volume-level DOI rather than the chapter's own -- confirm and replace with the chapter DOI.

% The date range is built from the predefined month macros (mar, apr) so the style decides how month names are printed.
@INPROCEEDINGS{Lee03Cooperativepeergroups,
  author = {Seungjoon Lee and Rob Sherwood and Bobby Bhattacharjee},
  title = {Cooperative peer groups in {NICE}},
  booktitle = {INFOCOM '03: Proc. 22nd annual joint conference of the IEEE Computer and Communications Societies},
  year = {2003},
  volume = {2},
  pages = {1272--1282},
  month = "30~" # mar # "--3~" # apr,
  abstract = {We present a distributed scheme for trust inference in peer-to-peer networks. Our work is in the context of the NICE system, which is a platform for implementing cooperative applications over the Internet. We describe a technique for efficiently storing user reputation information in a completely decentralized manner, and show how this information can be used to efficiently identify non-cooperative users in NICE. We present a simulation-based study of our algorithms, in which we show our scheme scales to thousands of users using modest amounts of storage, processing, and bandwidth at any individual node. Lastly we show that our scheme is robust and can form cooperative groups in systems where the vast majority of users are malicious.},
  keywords = {Internet, inference mechanisms, information retrieval Internet, NICE system, cooperative system, decentralized manner, distributed algorithm, individual node, malicious users, noncooperative users identification, peer-to-peer networks, platform, reputation information storage, trust inference},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1208963}
}

@INPROCEEDINGS{Li03FeasibilityofPeer-to-Peer,
  author = {Jinyang Li and Boon Loo and Joseph Hellerstein and M. Kaashoek and David Karger and Robert Morris},
  title = {On the feasibility of peer-to-peer web indexing and search},
  booktitle = {IPTPS '03: Proc. 2nd international workshop on Peer-To-Peer Systems},
  year = {2003},
  volume = {2735},
  series = {Lecture Notes in Computer Science},
  pages = {207--215},
  publisher = {Springer Berlin / Heidelberg},
  abstract = {This paper discusses the feasibility of peer-to-peer full-text keyword search of the Web. Two classes of keyword search techniques are in use or have been proposed: flooding of queries over an overlay network (as in Gnutella), and intersection of index lists stored in a distributed hash table. We present a simple feasibility analysis based on the resource constraints and search workload.
Our study suggests that the peer-to-peer network does not have enough capacity to make naive use of either of search techniques attractive for Web search. The paper presents a number of existing and novel optimizations for {P2P} search based on distributed hash tables, estimates their effects on performance, and concludes that in combination these optimizations would bring the problem to within an order of magnitude of feasibility. The paper suggests a number of compromises that might achieve the last order of magnitude.},
  doi = {10.1007/b11823},
  url = {http://www.springerlink.com/content/drdkdgvlgnw0ljkh}
}

% The surname carries a "von" particle; the comma form keeps the von/Last split unambiguous.
% month uses the predefined macro (jan) so the style controls its rendering.
@MISC{Lohmann06IAAL_WhatPeer-to-Peer,
  author = {von Lohmann, Fred},
  title = {{IAAL*:} What peer-to-peer developers need to know about copyright law},
  howpublished = {http://www.eff.org/wp/iaal-what-peer-peer-developers-need-know-about-copyright-law},
  month = jan,
  year = {2006},
  note = {Electronic Frontier Foundation},
  abstract = {Intro: The future of peer-to-peer file-sharing is entwined, for better or worse, with copyright law. Copyright owners have already targeted not only the makers of file-sharing clients like Napster, Scour, Audiogalaxy, Aimster and Kazaa, and Morpheus, but also companies that provide products that rely on or add value to public P2P networks, such as MP3Board.com, which provided a web-based search interface for the gnutella network. The U.S. Supreme Court in MGM v. Grokster addressed some, but by no means all, of the copyright law issues that may confront P2P developers and other technologists in the future. If these courtroom skirmishes yield any lesson for P2P developers, it is that a legal strategy needs to be in place early, preferably at the beginning of development, rather than bolted on at the end.},
  url = {http://www.eff.org/wp/iaal-what-peer-peer-developers-need-know-about-copyright-law}
}

@INPROCEEDINGS{Muthitacharoen01low-bandwidthnetworkfile,
  author = {Athicha Muthitacharoen and Benjie Chen and David Mazi{\`e}res},
  title = {A low-bandwidth network file system},
  booktitle = {SOSP '01: Proc. 18th ACM symposium on Operating systems principles},
  year = {2001},
  pages = {174--187},
  address = {New York, NY, USA},
  publisher = {ACM},
  abstract = {Users rarely consider running network file systems over slow or wide-area networks, as the performance would be unacceptable and the bandwidth consumption too high. Nonetheless, efficient remote file access would often be desirable over such networks---particularly when high latency makes remote login sessions unresponsive. Rather than run interactive programs such as editors remotely, users could run the programs locally and manipulate remote files through the file system. To do so, however, would require a network file system that consumes less bandwidth than most current file systems. This paper presents LBFS, a network file system designed for low-bandwidth networks. LBFS exploits similarities between files or versions of the same file to save bandwidth. It avoids sending data over the network when the same data can already be found in the server's file system or the client's cache.
Using this technique in conjunction with conventional compression and caching, LBFS consumes over an order of magnitude less bandwidth than traditional network file systems on common workloads.},
  doi = {10.1145/502034.502052},
  isbn = {1-58113-389-8},
  location = {Banff, Alberta, Canada},
  url = {http://portal.acm.org/citation.cfm?id=502052}
}

@INPROCEEDINGS{Quinlan02AwardedBestPaper!,
  author = {Sean Quinlan and Sean Dorward},
  title = {Venti: a new approach to archival data storage},
  booktitle = {FAST '02: Proc. 1st USENIX conference on File and Storage Technologies},
  year = {2002},
  pages = {7},
  address = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract = {This paper describes a network storage system, called Venti, intended for archival data. In this system, a unique hash of a block's contents acts as the block identifier for read and write operations. This approach enforces a write-once policy, preventing accidental or malicious destruction of data. In addition, duplicate copies of a block can be coalesced, reducing the consumption of storage and simplifying the implementation of clients. Venti is a building block for constructing a variety of storage applications such as logical backup, physical backup, and snapshot file systems. We have built a prototype of the system and present some preliminary performance results. The system uses magnetic disks as the storage technology, resulting in an access time for archival data that is comparable to non-archival data. The feasibility of the write-once model for storage is demonstrated using data from over a decade's use of two Plan 9 file systems.},
  location = {Monterey, CA},
  url = {http://www.usenix.org/publications/library/proceedings/fast02/quinlan.html}
}

@INPROCEEDINGS{Rhea03Pond_OceanStoreprototype,
  author = {Sean Rhea and Patrick Eaton and Dennis Geels and Hakim Weatherspoon and Ben Zhao and John Kubiatowicz},
  title = {Pond: the {OceanStore} prototype},
  booktitle = {FAST '03: Proc. 2nd USENIX conference on File and Storage Technologies},
  year = {2003},
  pages = {1--14},
  address = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract = {OceanStore is an Internet-scale, persistent data store designed for incremental scalability, secure sharing, and long-term durability. Pond is the OceanStore prototype; it contains many of the features of a complete system including location-independent routing, Byzantine update commitment, push-based update of cached copies through an overlay multicast network, and continuous archiving to erasure-coded form. In the wide area, Pond outperforms NFS by up to a factor of 4.6 on read-intensive phases of the Andrew benchmark, but underperforms NFS by as much as a factor of 7.3 on write-intensive phases. Microbenchmarks show that write performance is limited by the speed of erasure coding and threshold signature generation, two important areas of future research. Further microbenchmarks show that Pond manages replica consistency in a bandwidth-efficient manner and quantify the latency cost imposed by this bandwidth savings.},
  location = {San Francisco, CA},
  url = {http://www.usenix.org/events/fast03/tech/rhea.html}
}

% The title is sentence-cased like the rest of the file; only proper nouns need brace protection.
@INPROCEEDINGS{Rowstron01Pastry_ScalableDecentralized,
  author = {Antony Rowstron and Peter Druschel},
  title = {Pastry: Scalable, decentralized object location, and routing for large-scale peer-to-peer systems},
  booktitle = {Middleware '01: Proc. IFIP/ACM international conference on Distributed Systems Platforms},
  year = {2001},
  volume = {2218},
  series = {Lecture Notes in Computer Science},
  pages = {329--350},
  address = {Heidelberg, Germany},
  publisher = {Springer Berlin / Heidelberg},
  abstract = {This paper presents the design and evaluation of Pastry, a scalable, distributed object location and routing substrate for wide-area peer-to-peer applications. Pastry performs application-level routing and object location in a potentially very large overlay network of nodes connected via the Internet.
It can be used to support a variety of peer-to-peer applications, including global data storage, data sharing, group communication and naming. Each node in the Pastry network has a unique identifier ({nodeId}). When presented with a message and a key, a Pastry node efficiently routes the message to the node with a {nodeId} that is numerically closest to the key, among all currently live Pastry nodes. Each Pastry node keeps track of its immediate neighbors in the {nodeId} space, and notifies applications of new node arrivals, node failures and recoveries. Pastry takes into account network locality; it seeks to minimize the distance messages travel, according to a scalar proximity metric like the number of {IP} routing hops. Pastry is completely decentralized, scalable, and self-organizing; it automatically adapts to the arrival, departure and failure of nodes. Experimental results obtained with a prototype implementation on an emulated network of up to 100,000 nodes confirm Pastry's scalability and efficiency, its ability to self-organize and adapt to node failures, and its good network locality properties.},
  doi = {10.1007/3-540-45518-3_18},
  keywords = {dht, p2p},
  url = {http://www.springerlink.com/content/404522p56nm85503/}
}

@INPROCEEDINGS{Thekkath97Frangipani_scalabledistributed,
  author = {Chandramohan A. Thekkath and Timothy Mann and Edward K. Lee},
  title = {Frangipani: a scalable distributed file system},
  booktitle = {SOSP '97: Proc. 16th ACM symposium on Operating Systems Principles},
  year = {1997},
  pages = {224--237},
  address = {New York, NY, USA},
  publisher = {ACM},
  abstract = {The ideal distributed file system would provide all its users with coherent, shared access to the same set of files, yet would be arbitrarily scalable to provide more storage space and higher performance to a growing user community. It would be highly available in spite of component failures. It would require minimal human administration, and administration would not become more complex as more components were added. Frangipani is a new file system that approximates this ideal, yet was relatively easy to build because of its two-layer structure. The lower layer is Petal (described in an earlier paper), a distributed storage service that provides incrementally scalable, highly available, automatically managed virtual disks. In the upper layer, multiple machines run the same Frangipani file system code on top of a shared Petal virtual disk, using a distributed lock service to ensure coherence. Frangipani is meant to run in a cluster of machines that are under a common administration and can communicate securely. Thus the machines trust one another and the shared virtual disk approach is practical. Of course, a Frangipani file system can be exported to untrusted machines using ordinary network file access protocols. We have implemented Frangipani on a collection of Alphas running DIGITAL Unix 4.0. Initial measurements indicate that Frangipani has excellent single-server performance and scales well as servers are added.},
  doi = {10.1145/268998.266694},
  isbn = {0-89791-916-5},
  location = {Saint Malo, France},
  url = {http://portal.acm.org/citation.cfm?doid=268998.266694}
}

@INPROCEEDINGS{Waldman00Publius_robusttamper-evident,
  author = {Marc Waldman and Aviel D. Rubin and Lorrie Faith Cranor},
  title = {Publius: A robust, tamper-evident, censorship-resistant web publishing system},
  booktitle = {SSYM '00: Proc. 9th USENIX Security Symposium},
  year = {2000},
  pages = {59--72},
  address = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract = {We describe a system that we have designed and implemented for publishing content on the web. Our publishing scheme has the property that it is very difficult for any adversary to censor or modify the content. In addition, the identity of the publisher is protected once the content is posted.
Our system differs from others in that we provide tools for updating or deleting the published content, and users can browse the content in the normal point and click manner using a standard web browser and a client-side proxy that we provide. All of our code is freely available.},
  location = {Denver, Colorado},
  url = {http://www.cs.nyu.edu/~waldman/publius/paper.html}
}

% Initials are space-separated ("J. D.") so BibTeX parses them as two given-name tokens.
@INPROCEEDINGS{Walfish06Distributedquotaenforcement,
  author = {Michael Walfish and J. D. Zamfirescu and Hari Balakrishnan and David Karger},
  title = {Distributed quota enforcement for spam control},
  booktitle = {NSDI '06: Proc. 3rd USENIX symposium on Networked Systems Design \& Implementation},
  year = {2006},
  pages = {281--296},
  address = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract = {Spam, by overwhelming inboxes, has made email a less reliable medium than it was just a few years ago. Spam filters are undeniably useful but unfortunately can flag non-spam as spam. To restore email's reliability, a recent spam control approach grants quotas of stamps to senders and has the receiver communicate with a well-known quota enforcer to verify that the stamp on the email is fresh and to cancel the stamp to prevent reuse. The literature has several proposals based on this general idea but no complete system design and implementation that: scales to today's email load (which requires the enforcer to be distributed over many hosts and to tolerate faults in them), imposes minimal trust assumptions, resists attack, and upholds today's email privacy. This paper describes the design, implementation, analysis, and experimental evaluation of DQE, a spam control system that meets these challenges. DQE's enforcer occupies a point in the design spectrum notable for simplicity: mutually untrusting nodes implement a storage abstraction but avoid neighbor maintenance, replica maintenance, and heavyweight cryptography.},
  location = {San Jose, CA},
  url = {http://www.usenix.org/events/nsdi06/tech/walfish.html}
}

@INPROCEEDINGS{Walsh06ExperiencewithObject,
  author = {Kevin Walsh and Emin G{\"u}n Sirer},
  title = {Experience with an object reputation system for peer-to-peer filesharing},
  booktitle = {NSDI '06: Proc. 3rd USENIX symposium on Networked Systems Design \& Implementation},
  year = {2006},
  pages = {1--1},
  address = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract = {In this paper, we describe Credence, a decentralized object reputation and ranking system for large-scale peer-to-peer filesharing networks. Credence counteracts pollution in these networks by allowing honest peers to assess the authenticity of online content through secure tabulation and management of endorsements from other peers. Our system enables peers to learn relationships even in the absence of direct observations or interactions through a novel, flow-based trust computation to discover trustworthy peers. We have deployed Credence as an overlay on top of the Gnutella filesharing network, with more than 10,000 downloads of our client software to date. We describe the system design, our experience with its deployment, and results from a long-term study of the trust network built by users. Data from the live deployment shows that Credence's flow-based trust computation enables users to avoid undesirable content. Honest Credence clients can identify three quarters of the decoys encountered when querying the Gnutella network.},
  location = {San Jose, CA},
  url = {http://www.usenix.org/events/nsdi06/tech/walsh.html}
}
% NOTE(review): pages 1--1 above (and 16--16 below) look like digital-library placeholders rather than real page ranges -- confirm against the proceedings.

@INPROCEEDINGS{Yu06Availabilityofmulti-object,
  author = {Haifeng Yu and Phillip B. Gibbons and Suman Nath},
  title = {Availability of multi-object operations},
  booktitle = {NSDI '06: Proc. 3rd USENIX symposium on Networked Systems Design \& Implementation},
  year = {2006},
  pages = {16--16},
  address = {Berkeley, CA, USA},
  publisher = {USENIX Association},
  abstract = {Highly-available distributed storage systems are commonly designed to optimize the availability of individual data objects, despite the fact that user level tasks typically request multiple objects. In this paper, we show that the assignment of object replicas (or fragments, in the case of erasure coding) to machines plays a dramatic role in the availability of such multi-object operations, without affecting the availability of individual objects. For example, for the TPC-H benchmark under real-world failures, we observe differences of up to four nines between popular assignments used in existing systems. Experiments using our wide-area storage system prototype, MOAT, on the PlanetLab, as well as extensive simulations, show which assignments lead to the highest availability for a given setting.},
  location = {San Jose, CA},
  url = {http://www.usenix.org/events/nsdi06/tech/yu.html}
}