farewell to disks: efficient processing of obstinate data
DESCRIPTION
Professor Diomidis Spinellis gave a lecture on "Farewell to Disks: Efficient Processing of Obstinate Data" in the Distinguished Lecturer Series - Leon The Mathematician.
TRANSCRIPT
1
1
Αποχαιρετισμός στους Δίσκους:
Αποδοτική Επεξεργασία
Περίπλοκων Δεδομένων
Διομήδης Σπινέλλης
Καθηγητής
Τμήμα Διοικητικής Επιστήμης και Τεχνολογίας
Οικονομικό Πανεπιστήμιο Αθηνών
http://www.dmst.aueb.gr/dds
2
3
4
5
6
1
7
8
9
10
2
11
12
3
13
14
L1 D cache
1.3 ns
L2 cache
9.7 ns
DDR RAM
28.5 ns
Hard disk
25.6 ms
Worst case latency (Log scale)
15
16
17
18
1
19
Function call
1.3ns
System call
1.9μs
Local IPC
4.3μs
Remote IPC
1.2ms
Time (Log scale)
select Locations.cc1, Divisions.name, avg(CO2), count(*), Locations.lat, Locations.long, POPDENSITY.DENSITY from Papers inner join Locations on Papers.confLocId = Locations.id inner join Divisions on Locations.cc1 = Divisions.country inner join POPDENSITY on Divisions.name = upper(POPDENSITY.name) where Divisions.code = '00' and CO2 notnull group by Locations.cc1 having count(*) > 20 order by avg(CO2) desc;
20
/* Get the data */ if (mcSet.dataLen) { data = xmalloc(mcSet.dataLen); if (lseek(fd, mcSet.data.off, SEEK_SET) == -1) CORRUPT(); if (read(fd, data, mcSet.dataLen) != mcSet.dataLen) CORRUPT(); if (lseek(fd, mcSet.u.firstMsg, SEEK_SET) == -1) CORRUPT(); for (i = 0; i < mcSet.numMsgs; ++i) { if (read(fd, &mcMsg, sizeof(mcMsg)) != sizeof(mcMsg)) CORRUPT(); if (mcMsg.invalid) { --i; continue; } msg = xmalloc(sizeof(msgT)); memset(msg, '\0', sizeof(*msg)); /* […] */ msg->msgId = mcMsg.msgId; msg->str = xstrdup((char *) (data + mcMsg.msg.off)); } free(data); }
2
21
MMAP(2) FreeBSD System Calls Manual MMAP(2) NAME mmap -- allocate memory, or map files or devices into memory SYNOPSIS #include <sys/mman.h> void * mmap(void *addr, size_t len, int prot, int flags, int fd, off_t offset); DESCRIPTION The mmap() system call causes the pages starting at addr and continuing for at most len bytes to be mapped from the object described by fd, starting at byte offset offset.
22
[dds@istlab /usr/src/sys/vm]$ ls default_pager.c uma_int.h vm_page.c device_pager.c vm.h vm_page.h memguard.c vm_contig.c vm_pageout.c memguard.h vm_extern.h vm_pageout.h phys_pager.c vm_fault.c vm_pager.c pmap.h vm_glue.c vm_pager.h redzone.c vm_init.c vm_param.h redzone.h vm_kern.c vm_phys.c sg_pager.c vm_kern.h vm_phys.h swap_pager.c vm_map.c vm_reserv.c swap_pager.h vm_map.h vm_reserv.h uma.h vm_meter.c vm_unix.c uma_core.c vm_mmap.c vm_zeroidle.c uma_dbg.c vm_object.c vnode_pager.c uma_dbg.h vm_object.h vnode_pager.h
23
3
24
$ ls -lh sparse -rw-r--r-- 1 dds dds 500G Mar 19 20:32 sparse $ du -h sparse 28K sparse
4 διεργασία 1 διεργασία 2
φυσική μνήμη
r/o r/o
25
r/w r/w
διεργασία 1 διεργασία 2
φυσική μνήμη
read read
διεργασία 1 διεργασία 2
φυσική μνήμη
read r/w
αντίγραφο
διεργασία 1 διεργασία 2
φυσική μνήμη
26
5
C++
28
01110010011 0111101101101011 0000101101110011 00101 // romane 01110010011 0111101101101011 0000101101110011 1010101110011 // romanus 01110010011 0111101101101011 10101011011000111010101110011 // romulus 01110010011 10101011000100110 0101011 0111001110011 // rubens 01110010011 10101011000100110 0101011 10010 // ruber 01110010011 10101011000100110 100101100011011 0111101101110 // rubicon 01110010011 10101011000100110 100101100011011 1010101101110011001000111 010101110011 // rubicundus
Κατασκευή
δομής
δίσκου
Δομή δίσκου
Κατάλογος
άρθρων
Δομή
μνήμης
Κατασκευή
δένδρου
ριζών
Αρχική
ιστοσελίδα
Ιστοσελίδα
με νέους
δεσμούς
wikipedialize
for (;;) { i = bitpos; // Loop until the end of the current node or the end of the word while (i < p->end && i < len * 8) { // Covering whole byte? if (i % 8 == 0 && i + 8 <= p->end && (i + 8) / 8 <= len && data[i / 8] == p->data[i / 8]) { i += 8; continue; } // Split point if (getbit(data, i) != getbit(p->data, i)) { // Node with the new data struct pnode *n = new_node(data + i / 8, i % 8, (len - i / 8) * 8, NULL, NULL, true); // Tail of the current node struct pnode *t = new_node(p->data + i / 8, i % 8, p->end - (i & ~7), p->zero, p->one, p->is_terminal); // Head of current node if (getbit(data, i)) *p2 = new_node(p->data, bitpos, i, t, n, false); else *p2 = new_node(p->data, bitpos, i, n, t, false); free(p); return; } i++; } // while
29
// Write the given node to the specified file, returning its file offset. // On return the file's offset is set to the first free byte. static long write_node(struct pnode *p, FILE *f) { long my_offset = ftell(f); size_t ret; if (p->one) { struct pnode_disk_one pdo; size_t dlen = datalen(p->end); long len = sizeof(pdo) + dlen; fseek(f, len, SEEK_CUR); pdo.h.type = dt_one; pdo.h.is_terminal = p->is_terminal; pdo.h.has_zero = (p->zero != NULL); pdo.h.has_one = true; pdo.h.begin = p->begin; pdo.h.end = p->end; if (p->zero) write_node(p->zero, f); pdo.one = write_node(p->one, f); long saved_offset = ftell(f); fseek(f, my_offset, SEEK_SET); fwrite(&pdo, 1, sizeof(pdo), f); fwrite(p->data, 1, dlen, f); fseek(f, saved_offset, SEEK_SET); return my_offset; } else {
$ zcat enwiki-latest-all-titles-in-ns0.gz | wc -c 106,237,053 $ wc -c enwiki.pt 144,657,286 enwiki.pt
30
$ curl http://www.kiosek.com/dostoevsky/library/crimeandpunishment.txt | perl -pe 's/[\r\n]/ /g' >crimeandpunishment.txt
31
$ wc crimeandpunishment.txt 0 203,273 1,462,661 crimeandpunishment.txt
$ time ./wpltest en en_US.UTF-8 ISO-8859-1 data/enwiki.pt <crimeandpunishment.txt >/dev/null
$ time ./wpltest en en_US.UTF-8 ISO-8859-1 data/enwiki.pt <crimeandpunishment.txt >/dev/null Checked 406,225 prefixes real 0m5.859s # Cold cache real 0m1.876s # Warm cache user 0m1.780s sys 0m0.090s
32
// Prevent memory alignment problems
memcpy(&end, &(p.h->end), sizeof(end)); while (i < end && i < len * 8) { if (i % 8 == 0 && i + 8 <= end && data[i / 8] == pdata[i / 8]) {
i += 8; prefix += 8;
continue; }
// Split point if (getbit(data, i) != getbit(pdata, i))
return best / 8; i++;
prefix++; }
if (i == end && p.h->is_terminal) best = prefix;
if (i == len * 8) return best / 8; // Move to next node
bitpos = end % 8; int covered = end / 8;
if (getbit(data, end)) { if (!p.h->has_one)
return best / 8; switch (p.h->type) {
case dt_both: p.h = (struct pnode_disk_head *)(base + p.b->one);
break; case dt_one:
p.h = (struct pnode_disk_head *)(base + p.o->one); break;
case dt_short: default: assert(0);
} } else {
if (!p.h->has_zero) return best / 8;
switch (p.h->type) { case dt_both:
p.h = (struct pnode_disk_head *)(base + p.b->zero); break;
case dt_one: // Advance to the end of this node
p.h = (struct pnode_disk_head *)((char *)p.h + sizeof(struct pnode_disk_one) + datalen(end)); break;
case dt_short: // Advance to the end of this node p.h = (struct pnode_disk_head *)((char *)p.h + sizeof(struct pnode_disk_short) + datalen(end));
break; default:
assert(0); }
}
/*
* You are not expected
* to understand this
*/
33
e.g. 2
The problem with wikipedia
34
Λίστα ακμών Δομή
δεδομένων
γράφου
Κατασκευή
γράφου
Κορυφές
αρχής, τέλους Διαδρομή
BFS
Δομή
δεδομένων
γράφου
Λίστα ακμών
Κατασκευή
γράφου
Κορυφές
αρχής, τέλους Διαδρομή
BFS
Δομή
δεδομένων
γράφου
35
Λίστα ακμών Δομή
δεδομένων
γράφου
Κατασκευή
γράφου
// Loop through all lines, // adding them to the graph while (std::getline(in, line)) { int split = line.find('\001'); if (split == std::string::npos) { std::cerr << "No separator: " << line << std::endl; continue; } n.setName(line.substr(0, split)); NodesIter from(entries->insert(n).first); n.setName(line.substr(split + 1)); NodesIter to(entries->insert(n).first); (const_cast<Node &>(*from)).addEdge( const_cast<Node *>(&*to)); }
Λίστα ακμών
Κατασκευή
γράφου
Κορυφές
αρχής, τέλους Διαδρομή
BFS
Δομή
δεδομένων
γράφου
36
Διαδρομή
BFS
Δομή
δεδομένων
γράφου
Tacoma Narrows Bridge
p=
Suspension bridge
p=
Washington
p=
Geneva
p=
William Howard Taft
p=
Montana
p=
Ουρά
[]=
Tacoma Narrows Bridge
p=
Suspension bridge
p=
Washington
p=
Geneva
p=
William Howard Taft
p=
Montana
p=
Ουρά
[]=Tacoma Narrows Bridge
37
Tacoma Narrows Bridge
p=
Suspension bridge
p=
Washington
p=
Geneva
p=
William Howard Taft
p=
Montana
p=
Ουρά
[]=Tacoma Narrows Bridge
Tacoma Narrows Bridge
p=
Suspension bridge
p=
Washington
p=
Geneva
p=
William Howard Taft
p=
Montana
p=
Ουρά
[]=
Tacoma Narrows Bridge
p=
Suspension bridge
p=Tacoma Narrows Bridge
Washington
p=
Geneva
p=
William Howard Taft
p=
Montana
p=
Ουρά
[]= Suspension bridge
38
Suspension bridge
p=Tacoma Narrows Bridge
Washington
p=Tacoma Narrows Bridge
Geneva
p=
William Howard Taft
p=
Montana
p=
Ουρά
[]= Suspension bridge
Washington
Tacoma Narrows Bridge
p=
Tacoma Narrows Bridge
p=
Suspension bridge
p=Tacoma Narrows Bridge
Washington
p=Tacoma Narrows Bridge
Geneva
p=
William Howard Taft
p=
Montana
p=
Ουρά
[]= Suspension bridge
Washington
Suspension bridge
p=Tacoma Narrows Bridge
Washington
p=Tacoma Narrows Bridge
Geneva
p=Suspension bridge
William Howard Taft
p=
Montana
p=
Ουρά
[]= Washington
Geneva
Tacoma Narrows Bridge
p=
39
Washington
p=Tacoma Narrows Bridge
Geneva
p=Suspension bridge
William Howard Taft
p=
Montana
p=Washington
Ουρά
[]= Geneva
Montana
Tacoma Narrows Bridge
p=
Suspension bridge
p=Tacoma Narrows Bridge
Washington
p=Tacoma Narrows Bridge
Geneva
p=Suspension bridge
William Howard Taft
p=
Montana
p=Washington
Ουρά
[]=Montana
Tacoma Narrows Bridge
p=
Suspension bridge
p=Tacoma Narrows Bridge
Washington
p=Tacoma Narrows Bridge
Geneva
p=Suspension bridge
William Howard Taft
p=Geneva
Montana
p=Washington
Ουρά
[]=Montana
Tacoma Narrows Bridge
p=
Suspension bridge
p=Tacoma Narrows Bridge
40
/*
 * Breadth-first search of the graph, starting at "from" and stopping
 * as soon as "to" is discovered.
 *
 * Only nodes whose color is Node::White are treated as undiscovered.
 * A discovered node is colored Gray, has its predecessor set to the
 * node it was reached from, and is queued; a node turns Black once all
 * its edges have been examined.
 *
 * Returns true when "to" is discovered (its predecessor chain then
 * leads back towards "from"), false when the queue drains first.
 * Because "from" is colored Gray before the search starts, calling
 * this with from == to returns false.
 *
 * The parameter n is not used by this code; it is kept so that
 * existing callers continue to compile.
 */
static bool breadthFirstSearchFor(NodePtr from, NodePtr to,
    size_t n) {
	(void)n;

	std::queue<NodePtr> q;

	from->setColor(Node::Gray);
	q.push(from);
	while (!q.empty()) {
		NodePtr u = q.front();
		q.pop();
		// Bind by const reference rather than copying the edge list
		// for every visited node. This is valid whether getEdges()
		// returns a reference or a temporary, since a const reference
		// extends a temporary's lifetime to the end of this scope.
		const Edges &edges = u->getEdges();
		for (Edges::const_iterator j = edges.begin();
		    j != edges.end(); ++j) {
			if ((*j)->getColor() != Node::White)
				continue;	// Already discovered
			(*j)->setColor(Node::Gray);
			(*j)->setPredecessor(u);
			if (*j == to)
				return true;	// Found
			q.push(*j);
		}
		u->setColor(Node::Black);
	}
	return false;			// Not found
}
Λίστα ακμών
Κατασκευή
γράφου
Κορυφές
αρχής, τέλους Διαδρομή
BFS
Δομή
δεδομένων
γράφου
Δομή
δεδομένων
γράφου
41
42
Δομή
δεδομένων
γράφου
#include <string> #include <iostream> #include <queue> #include <list> #include <functional> #include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/offset_ptr.hpp> #include <boost/interprocess/allocators/allocator.hpp> #include <boost/unordered_set.hpp> #include <boost/interprocess/containers/string.hpp> #include <boost/interprocess/containers/slist.hpp> #include <boost/filesystem.hpp> #include <boost/filesystem/operations.hpp>
43
#include <string> #include <iostream> #include <queue> #include <list> #include <functional> #include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/offset_ptr.hpp> #include <boost/interprocess/allocators/allocator.hpp> #include <boost/unordered_set.hpp> #include <boost/interprocess/containers/string.hpp> #include <boost/interprocess/containers/slist.hpp> #include <boost/filesystem.hpp> #include <boost/filesystem/operations.hpp>
// Type aliases for a graph whose storage is obtained through the
// segment manager of a managed_mapped_file.

// The segment manager type that the allocators below are parameterized on
typedef managed_mapped_file::segment_manager SegmentManager;
// Character allocator and the string type built on it (used for node names)
typedef allocator<char, SegmentManager> CharAllocator;
typedef basic_string<char, std::char_traits<char>, CharAllocator> CharString;
// Node allocator and the hashed set that holds all nodes
typedef allocator<Node, SegmentManager> NodeAllocator;
typedef boost::unordered_set<Node, boost::hash<Node>, NodeEqual, NodeAllocator> Nodes;
// Pointer type used for links between nodes
// NOTE(review): presumably offset_ptr is used so links stay valid at any
// mapping address -- confirm against the Boost.Interprocess documentation
typedef offset_ptr<Node> NodePtr;
// Allocator for node pointers and the singly-linked list of a node's edges
typedef allocator<NodePtr, SegmentManager> NodePtrAllocator;
typedef slist<NodePtr, NodePtrAllocator> Edges;
// Generic (void) allocator and an allocator for edge lists
typedef allocator<void, SegmentManager> VoidAllocator;
typedef allocator<Edges, SegmentManager> EdgesAllocator;
// A graph node, suitable for performing a breadth-first search class Node { public: typedef enum {White, Gray, Black} Color; private: CharString name; // Node name Color color; // Color used during BFS NodePtr predecessor; // BFS predecessor Edges edges; // Node's edges public: // Since VoidAllocator is convertible to any other // allocator<T>, we can simplify the initialization // taking just one allocator for all inner containers. Node(const std::string &n, const VoidAllocator &voidAlloc) : name(n.begin(), n.end(), voidAlloc), color(White), predecessor(NULL), edges(voidAlloc) {} void addEdge(NodePtr p) { edges.push_front(p); } };
44
/* * Read ^A-separated nodes from the inputFile, storing the graph * structure in the specified backingFile. */ static void readData(const char *backingFile, const char *inputFile) { std::ifstream in(inputFile, std::ios::binary); if (in.fail()) { perror(inputFile); exit(1); } boost::filesystem::remove_all(backingFile); managed_mapped_file segment(create_only, backingFile, FileSize); // An allocator convertible to any allocator<T, SegmentManager> type VoidAllocator allocInst (segment.get_segment_manager()); // Construct the memory map and fill it Nodes *entries = segment.construct<Nodes>("entries")(Elements, boost::hash<Node>(), NodeEqual(), allocInst); std::string line; Node n(std::string(), allocInst); // To save construction costs
/* * Search and report the shortest graph path from "from" to "to" * The graph is stored in backingFile. */ static void searchData(const char *backingFile, const std::string &from, const std::string &to) { managed_mapped_file segment(open_copy_on_write, backingFile); // An allocator convertible to any allocator<T, SegmentManager> VoidAllocator allocInst(segment.get_segment_manager()); // Obtain the previously saved entries Nodes *entries = segment.find<Nodes>("entries").first; NodePtr toPtr; bool found = breadthFirstSearchFor( findNode(entries, Node(from, allocInst)), toPtr = findNode(entries, Node(to, allocInst)), entries->size());
45
Λίστα ακμών
Κατασκευή
γράφου
Κορυφές
αρχής, τέλους Διαδρομή
BFS
Δομή
δεδομένων
γράφου
46
$ ./smap -r graph.bin graph.txt
$ ./smap -s graph.bin 'Tacoma Narrows Bridge'\ 'William howard taft' 0% 10 20 30 40 50 60 70 80 90 100% |----|----|----|----|----|----|----|----|----|----| *
Tacoma Narrows Bridge Washington Montana William howard taft
$ ./smap -s graph.bin 'Tacoma Narrows Bridge'\ '24-hour analog dial' 0% 10 20 30 40 50 60 70 80 90 100% |----|----|----|----|----|----|----|----|----|----| **
Tacoma Narrows Bridge Suspension bridge Geneva Watch 24-hour analog dial
47
$ ./smap -s graph.bin 'Tacoma Narrows Bridge' 'Wet t-shirt contest' 0% 10 20 30 40 50 60 70 80 90 100% |----|----|----|----|----|----|----|----|----|----| *
Tacoma Narrows Bridge Washington Starbucks Toplessness Wet t-shirt contest
The problem with wikipedia
48
Performance
MySQL mmap
Server 15:59:43
Client system 03:16:59 00:04:32
Client user 00:52:48 00:04:52
00:00
06:00
12:00
18:00
Χρόνος (ω:λ)
Κατασκευή δομής δεδομένων
MySQL mmap
Waiting 348 3.886
Server 259
Client system 58 19
Client user 16 2
0 500 1.000 1.500 2.000 2.500 3.000 3.500 4.000 4.500
Χρόνος / κόμβο (μs)
Taft: Κρύα κρυφή μνήμη
49
MySQL mmap
Waiting 23 0
Server 305
Client system 59 5
Client user 15 3
0 50 100 150 200 250 300 350 400 450
Χρόνος / κόμβο (μs)
Taft: Ζεστή κρυφή μνήμη
MySQL mmap
Waiting 415 1.977
Server 472
Client system 103 10
Client user 26 4
0
500
1.000
1.500
2.000
2.500
Χρόνος / κόμβο (μs)
24h Clock: Κρύα κρυφή μνήμη
MySQL mmap
Waiting 120 0
Server 469
Client system 103 3
Client user 27 4
0 100 200 300 400 500 600 700 800
Χρόνος / κόμβο (μs)
24h Clock: Ζεστή κρυφή μνήμη
50
0
1
2
3
4
5
0 2000 4000 6000 8000
Χρόνος (ρ) / κόμβο (ms)
Αριθμός κόμβων
Χιλιάδες
Κλιμάκωση απόδοσης (κρύα μνήμη)
mmap
MySQL
51
ACID
A
52
C
I
D
53
SQL
54
A case…
Application code
vector<Customer> customers1;
Customer c1(d1,cd1,s1,p1);
customers1.push_back(c1);
…
vector<Truck> trucks;
Truck t1(cs1,dc1,pc1,rlp1, customers1);
trucks.push_back(t1);
….
ODBC
JDBC
55
register
L1 D cache
L2 cache
DRAM
HDD cache
HDD / SSD
L3 cache
56
534,681,000 εντολές ΚΜΕ
1
10
100
1,000
10,000
100,000
L1 D cache L2 cache DDR RAM Hard disk
Μέγιστη διεκπεραιωτικότητα (MB/s)
L1 D cache 1.3 ns
L2 cache 9.7 ns
DDR RAM 28.5 ns
Hard disk 25.6 ms
Χείριστη αναμονή (λογ. κλιμ.)
57
L1 D cache
1.3 ns
L2 cache
9.7 ns
DDR RAM
28.5 ns
Hard disk
25.6 ms
Χείριστη αναμονή (λογ. κλιμ.)
58
// Write the given node to the specified file, returning its file offset. // On return the file's offset is set to the first free byte. static long write_node(struct pnode *p, FILE *f) { long my_offset = ftell(f); size_t ret; if (p->one) { struct pnode_disk_one pdo; size_t dlen = datalen(p->end); long len = sizeof(pdo) + dlen; fseek(f, len, SEEK_CUR); pdo.h.type = dt_one; pdo.h.is_terminal = p->is_terminal; pdo.h.has_zero = (p->zero != NULL); pdo.h.has_one = true; pdo.h.begin = p->begin; pdo.h.end = p->end; if (p->zero) write_node(p->zero, f); pdo.one = write_node(p->one, f); long saved_offset = ftell(f); fseek(f, my_offset, SEEK_SET); fwrite(&pdo, 1, sizeof(pdo), f); fwrite(p->data, 1, dlen, f); fseek(f, saved_offset, SEEK_SET); return my_offset; } else {
59
#include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/offset_ptr.hpp> #include <boost/interprocess/allocators/allocator.hpp> #include <boost/unordered_set.hpp> #include <boost/interprocess/containers/string.hpp> #include <boost/interprocess/containers/slist.hpp>
w r/ο
βήμα 1 βήμα Ν
φυσική μνήμη
read r/w
αντίγραφο
διεργασία 1 διεργασία 2
φυσική μνήμη
61
www.spinellis.gr/wpl
www.spinellis.gr/blog/20101030/smap.cpp