Example of Data Extraction

[1]:
# import openLA as la
import OpenLA as la
[2]:
course_info, event_stream = la.start_analysis(files_dir="dataset_sample", course_id="A")

Extract data about selected user

[3]:
users = course_info.user_id()
user_stream = la.select_user(event_stream, users[0])
[4]:
users[0]
[4]:
'A_U1'
[5]:
user_stream.df  # event stream related to user "A_U1"
[5]:
userid contentsid operationname pageno marker memo_length devicecode eventtime
0 A_U1 C1 PREV 10 NaN 0 tablet 2018-04-09 10:57:15
1 A_U1 C1 PREV 9 NaN 0 tablet 2018-04-09 11:00:59
2 A_U1 C1 PREV 8 NaN 0 tablet 2018-04-09 11:03:31
3 A_U1 C1 PREV 30 NaN 0 tablet 2018-04-10 10:14:12
4 A_U1 C1 PREV 29 NaN 0 tablet 2018-04-10 10:27:24
... ... ... ... ... ... ... ... ...
1464 A_U1 C8 NEXT 2 NaN 0 tablet 2018-04-24 15:27:39
1465 A_U1 C8 NEXT 3 NaN 0 tablet 2018-04-24 15:27:57
1466 A_U1 C8 NEXT 4 NaN 0 tablet 2018-04-24 15:28:07
1467 A_U1 C8 NEXT 5 NaN 0 tablet 2018-05-29 15:42:36
1468 A_U1 C8 NEXT 6 NaN 0 tablet 2018-05-29 15:43:16

1469 rows × 8 columns

Extract data about selected contents

[6]:
contents = course_info.contents_id()
content_stream = la.select_contents(event_stream, contents[0])
[7]:
contents[0]
[7]:
'C1'
[8]:
content_stream.df  # event stream related to content "C1"
[8]:
userid contentsid operationname pageno marker memo_length devicecode eventtime
0 A_U1 C1 PREV 10 NaN 0 tablet 2018-04-09 10:57:15
1 A_U1 C1 PREV 9 NaN 0 tablet 2018-04-09 11:00:59
2 A_U1 C1 PREV 8 NaN 0 tablet 2018-04-09 11:03:31
3 A_U1 C1 PREV 30 NaN 0 tablet 2018-04-10 10:14:12
4 A_U1 C1 PREV 29 NaN 0 tablet 2018-04-10 10:27:24
... ... ... ... ... ... ... ... ...
262022 A_U99 C1 NEXT 15 NaN 0 pc 2018-06-05 16:07:52
262023 A_U99 C1 ADD MARKER 64 important 0 pc 2018-06-05 16:08:15
262024 A_U99 C1 NEXT 9 NaN 0 pc 2018-06-05 16:13:05
262025 A_U99 C1 ADD BOOKMARK 23 NaN 0 pc 2018-06-05 16:29:09
262026 A_U99 C1 NEXT 23 NaN 0 pc 2018-06-05 16:29:52

53992 rows × 8 columns

‘select_user()’ and ‘select_contents()’ can be used not only with the EventStream class but also with the converted classes.

[9]:
page_transition = la.convert_into_page_wise(event_stream, invalid_seconds=3)
user_page_transition = la.select_user(page_transition, users[0:2])
contents_page_transition = la.select_contents(user_page_transition, contents[0:2])
[10]:
users[0:2]
[10]:
['A_U1', 'A_U10']
[11]:
contents[0:2]
[11]:
['C1', 'C2']
[12]:
contents_page_transition.df # page transition data related to users "A_U1" and "A_U10", and contents "C1" and "C2"
[12]:
userid contentsid pageno num_visits average_reading_seconds reading_seconds PREV NEXT CLOSE PAGE_JUMP ... ADD BOOKMARK ADD MARKER DELETE MARKER DELETE BOOKMARK ADD MEMO DELETE_MEMO CHANGE MEMO SEARCH SEARCH_JUMP LINK_CLICK
0 A_U1 C1 1 13 6686.230769 86921 0 7 0 10 ... 0 0 0 0 0 0 0 0 0 0
1 A_U1 C1 2 9 214.222222 1928 0 6 1 0 ... 0 2 1 0 0 0 0 0 0 0
2 A_U1 C1 3 7 68.142857 477 0 7 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 A_U1 C1 4 6 74.833333 449 1 5 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 A_U1 C1 5 6 87.833333 527 0 6 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
442 A_U10 C2 58 1 179.000000 179 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
443 A_U10 C2 59 1 72.000000 72 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
444 A_U10 C2 60 1 16.000000 16 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
445 A_U10 C2 61 1 94.000000 94 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
446 A_U10 C2 62 2 310.000000 620 1 0 0 1 ... 0 0 0 0 0 0 0 0 0 0

258 rows × 22 columns

Extract data during, before, and after a lecture

During lecture

[29]:
lecture_week=2

event_stream_ = la.select_user(la.select_contents(event_stream, contents[0]), users[0])  # related to user 'A_U1' and content 'C1'

stream_during_lecture = la.select_by_lecture_time(course_info, event_stream_, lecture_week=2, timing="during")

lecture_start = course_info.lecture_start_time(lecture_week)
lecture_end = course_info.lecture_end_time(lecture_week)
[30]:
lecture_start
[30]:
Timestamp('2018-04-17 14:50:00')
[31]:
lecture_end
[31]:
Timestamp('2018-04-17 16:20:00')
[32]:
stream_during_lecture.df  # during the lecture time (between '2018-04-17 14:50:00' and '2018-04-17 16:20:00')
[32]:
userid contentsid operationname pageno marker memo_length devicecode eventtime
47 A_U1 C1 PAGE_JUMP 1 NaN 0 tablet 2018-04-17 14:50:01
48 A_U1 C1 NEXT 1 NaN 0 tablet 2018-04-17 14:51:40
49 A_U1 C1 NEXT 2 NaN 0 tablet 2018-04-17 14:51:44
50 A_U1 C1 NEXT 3 NaN 0 tablet 2018-04-17 14:54:24
51 A_U1 C1 NEXT 4 NaN 0 tablet 2018-04-17 14:54:24
... ... ... ... ... ... ... ... ...
135 A_U1 C1 NEXT 59 NaN 0 tablet 2018-04-17 16:03:39
136 A_U1 C1 NEXT 60 NaN 0 tablet 2018-04-17 16:03:54
137 A_U1 C1 NEXT 61 NaN 0 tablet 2018-04-17 16:04:22
138 A_U1 C1 NEXT 62 NaN 0 tablet 2018-04-17 16:04:37
139 A_U1 C1 NEXT 63 NaN 0 tablet 2018-04-17 16:05:12

93 rows × 8 columns

[33]:
# If you want to include the 5 minutes before the beginning of the lecture and the 5 minutes after its end,
# the arguments 'extension_minutes_before_lecture' and 'extension_minutes_after_lecture' are useful.

stream_during_lecture_add_5minutes = la.select_by_lecture_time(course_info, event_stream_,
                                                              lecture_week=2, timing="during",
                                                              extension_minutes_before_lecture=5,
                                                              extension_minutes_after_lecture=5
                                                              )
[34]:
stream_during_lecture_add_5minutes.df # between '2018-04-17 14:50:00 - 5 minutes' and '2018-04-17 16:20:00 + 5 minutes'
[34]:
userid contentsid operationname pageno marker memo_length devicecode eventtime
46 A_U1 C1 OPEN 1 NaN 0 tablet 2018-04-17 14:49:55
47 A_U1 C1 PAGE_JUMP 1 NaN 0 tablet 2018-04-17 14:50:01
48 A_U1 C1 NEXT 1 NaN 0 tablet 2018-04-17 14:51:40
49 A_U1 C1 NEXT 2 NaN 0 tablet 2018-04-17 14:51:44
50 A_U1 C1 NEXT 3 NaN 0 tablet 2018-04-17 14:54:24
... ... ... ... ... ... ... ... ...
135 A_U1 C1 NEXT 59 NaN 0 tablet 2018-04-17 16:03:39
136 A_U1 C1 NEXT 60 NaN 0 tablet 2018-04-17 16:03:54
137 A_U1 C1 NEXT 61 NaN 0 tablet 2018-04-17 16:04:22
138 A_U1 C1 NEXT 62 NaN 0 tablet 2018-04-17 16:04:37
139 A_U1 C1 NEXT 63 NaN 0 tablet 2018-04-17 16:05:12

94 rows × 8 columns

[35]:
# If you want to omit the first 10 minutes after the beginning of the lecture and the last 10 minutes before its end,
# negative values can be used for the arguments 'extension_minutes_before_lecture' and 'extension_minutes_after_lecture'.

stream_during_lecture_omit_10minutes = la.select_by_lecture_time(course_info, event_stream,
                                                                 lecture_week=2, timing="during",
                                                                 extension_minutes_before_lecture=-10,
                                                                 extension_minutes_after_lecture=-10
                                                                )
[36]:
stream_during_lecture_omit_10minutes.df # between '2018-04-17 14:50:00 + 10 minutes' and '2018-04-17 16:20:00 - 10 minutes'
[36]:
userid contentsid operationname pageno marker memo_length devicecode eventtime
54 A_U1 C1 NEXT 7 NaN 0 tablet 2018-04-17 15:01:43
55 A_U1 C1 NEXT 8 NaN 0 tablet 2018-04-17 15:01:44
56 A_U1 C1 NEXT 9 NaN 0 tablet 2018-04-17 15:01:47
57 A_U1 C1 NEXT 10 NaN 0 tablet 2018-04-17 15:01:56
58 A_U1 C1 NEXT 11 NaN 0 tablet 2018-04-17 15:01:56
... ... ... ... ... ... ... ... ...
263140 A_U99 C8 PREV 61 NaN 0 pc 2018-04-17 15:48:56
263141 A_U99 C8 PREV 60 NaN 0 pc 2018-04-17 15:49:01
263142 A_U99 C8 NEXT 52 NaN 0 pc 2018-04-17 15:50:11
263143 A_U99 C8 NEXT 53 NaN 0 pc 2018-04-17 15:51:26
263144 A_U99 C8 NEXT 54 NaN 0 pc 2018-04-17 15:51:32

24586 rows × 8 columns

Before lecture

[37]:
stream_before_lecture = la.select_by_lecture_time(course_info, event_stream_, lecture_week=lecture_week, timing="before")
[38]:
lecture_start
[38]:
Timestamp('2018-04-17 14:50:00')
[39]:
stream_before_lecture.df  # between the end of lecture 1 and the beginning of lecture 2 ('2018-04-10 16:20:00' - '2018-04-17 14:50:00')
[39]:
userid contentsid operationname pageno marker memo_length devicecode eventtime
36 A_U1 C1 CLOSE 53 NaN 0 tablet 2018-04-10 16:21:24
37 A_U1 C1 NEXT 23 NaN 0 tablet 2018-04-10 16:22:21
38 A_U1 C1 NEXT 24 NaN 0 tablet 2018-04-10 16:22:39
39 A_U1 C1 NEXT 25 NaN 0 tablet 2018-04-10 16:22:56
40 A_U1 C1 PAGE_JUMP 1 NaN 0 tablet 2018-04-10 16:51:07
41 A_U1 C1 OPEN 1 NaN 0 tablet 2018-04-10 16:51:28
42 A_U1 C1 NEXT 29 NaN 0 tablet 2018-04-17 00:05:16
43 A_U1 C1 NEXT 30 NaN 0 tablet 2018-04-17 00:05:33
44 A_U1 C1 BOOKMARK_JUMP 53 NaN 0 tablet 2018-04-17 14:00:03
45 A_U1 C1 PAGE_JUMP 1 NaN 0 tablet 2018-04-17 14:00:44
46 A_U1 C1 OPEN 1 NaN 0 tablet 2018-04-17 14:49:55
[40]:
# If you want to include all logs before lecture 2, set the argument 'include_other_lecture_time' to True
stream_before_lecture2_include_other_lecture_time = la.select_by_lecture_time(course_info, event_stream_,
                                                                              lecture_week=2, timing="before",
                                                                              include_other_lecture_time=True)
[41]:
stream_before_lecture2_include_other_lecture_time.df  # from the beginning of the logs to the beginning of lecture 2 ('2018-04-17 14:50:00')
[41]:
userid contentsid operationname pageno marker memo_length devicecode eventtime
0 A_U1 C1 PREV 10 NaN 0 tablet 2018-04-09 10:57:15
1 A_U1 C1 PREV 9 NaN 0 tablet 2018-04-09 11:00:59
2 A_U1 C1 PREV 8 NaN 0 tablet 2018-04-09 11:03:31
3 A_U1 C1 PREV 30 NaN 0 tablet 2018-04-10 10:14:12
4 A_U1 C1 PREV 29 NaN 0 tablet 2018-04-10 10:27:24
5 A_U1 C1 PREV 28 NaN 0 tablet 2018-04-10 10:27:24
6 A_U1 C1 PREV 27 NaN 0 tablet 2018-04-10 10:37:19
7 A_U1 C1 PREV 26 NaN 0 tablet 2018-04-10 15:13:55
8 A_U1 C1 NEXT 28 NaN 0 tablet 2018-04-10 15:15:34
9 A_U1 C1 PREV 7 NaN 0 tablet 2018-04-10 15:21:19
10 A_U1 C1 NEXT 29 NaN 0 tablet 2018-04-10 15:22:54
11 A_U1 C1 NEXT 30 NaN 0 tablet 2018-04-10 15:24:29
12 A_U1 C1 NEXT 31 NaN 0 tablet 2018-04-10 15:26:12
13 A_U1 C1 PREV 25 NaN 0 tablet 2018-04-10 15:31:14
14 A_U1 C1 NEXT 32 NaN 0 tablet 2018-04-10 15:31:35
15 A_U1 C1 NEXT 33 NaN 0 tablet 2018-04-10 15:34:19
16 A_U1 C1 NEXT 34 NaN 0 tablet 2018-04-10 15:34:58
17 A_U1 C1 NEXT 35 NaN 0 tablet 2018-04-10 15:38:19
18 A_U1 C1 NEXT 12 NaN 0 tablet 2018-04-10 15:40:26
19 A_U1 C1 PREV 6 NaN 0 tablet 2018-04-10 15:41:01
20 A_U1 C1 NEXT 36 NaN 0 tablet 2018-04-10 15:42:10
21 A_U1 C1 NEXT 37 NaN 0 tablet 2018-04-10 15:45:54
22 A_U1 C1 NEXT 38 NaN 0 tablet 2018-04-10 15:47:09
23 A_U1 C1 NEXT 39 NaN 0 tablet 2018-04-10 15:47:46
24 A_U1 C1 NEXT 13 NaN 0 tablet 2018-04-10 15:55:49
25 A_U1 C1 NEXT 8 NaN 0 tablet 2018-04-10 15:56:38
26 A_U1 C1 NEXT 5 NaN 0 tablet 2018-04-10 15:57:56
27 A_U1 C1 NEXT 14 NaN 0 tablet 2018-04-10 15:58:40
28 A_U1 C1 NEXT 9 NaN 0 tablet 2018-04-10 15:58:45
29 A_U1 C1 PREV 24 NaN 0 tablet 2018-04-10 16:00:24
30 A_U1 C1 NEXT 15 NaN 0 tablet 2018-04-10 16:05:57
31 A_U1 C1 NEXT 6 NaN 0 tablet 2018-04-10 16:07:44
32 A_U1 C1 NEXT 16 NaN 0 tablet 2018-04-10 16:15:31
33 A_U1 C1 NEXT 16 NaN 0 tablet 2018-04-10 16:15:34
34 A_U1 C1 NEXT 7 NaN 0 tablet 2018-04-10 16:17:20
35 A_U1 C1 PREV 23 NaN 0 tablet 2018-04-10 16:19:10
36 A_U1 C1 CLOSE 53 NaN 0 tablet 2018-04-10 16:21:24
37 A_U1 C1 NEXT 23 NaN 0 tablet 2018-04-10 16:22:21
38 A_U1 C1 NEXT 24 NaN 0 tablet 2018-04-10 16:22:39
39 A_U1 C1 NEXT 25 NaN 0 tablet 2018-04-10 16:22:56
40 A_U1 C1 PAGE_JUMP 1 NaN 0 tablet 2018-04-10 16:51:07
41 A_U1 C1 OPEN 1 NaN 0 tablet 2018-04-10 16:51:28
42 A_U1 C1 NEXT 29 NaN 0 tablet 2018-04-17 00:05:16
43 A_U1 C1 NEXT 30 NaN 0 tablet 2018-04-17 00:05:33
44 A_U1 C1 BOOKMARK_JUMP 53 NaN 0 tablet 2018-04-17 14:00:03
45 A_U1 C1 PAGE_JUMP 1 NaN 0 tablet 2018-04-17 14:00:44
46 A_U1 C1 OPEN 1 NaN 0 tablet 2018-04-17 14:49:55

After lecture

[54]:
stream_after_lecture = la.select_by_lecture_time(course_info, event_stream_, lecture_week=2,  timing="after")
[55]:
lecture_end
[55]:
Timestamp('2018-04-17 16:20:00')
[56]:
stream_after_lecture.df # between the end of lecture 2 and the beginning of lecture 3 ('2018-04-17 16:20:00' - '2018-04-24 14:50:00')
[56]:
userid contentsid operationname pageno marker memo_length devicecode eventtime
140 A_U1 C1 NEXT 17 NaN 0 tablet 2018-04-24 14:06:08
141 A_U1 C1 NEXT 49 NaN 0 tablet 2018-04-24 14:40:09
142 A_U1 C1 NEXT 50 NaN 0 tablet 2018-04-24 14:40:46
143 A_U1 C1 NEXT 51 NaN 0 tablet 2018-04-24 14:40:52
144 A_U1 C1 NEXT 52 NaN 0 tablet 2018-04-24 14:41:00
[57]:
# If you want to include all logs after lecture 2, set the argument 'include_other_lecture_time' to True
stream_after_lecture2_include_other_lecture_time = la.select_by_lecture_time(course_info, event_stream_,
                                                                             lecture_week=2, timing="after",
                                                                             include_other_lecture_time=True)
[58]:
stream_after_lecture2_include_other_lecture_time.df # from the end of lecture 2 ('2018-04-17 16:20:00') to the end of the logs.
[58]:
userid contentsid operationname pageno marker memo_length devicecode eventtime
140 A_U1 C1 NEXT 17 NaN 0 tablet 2018-04-24 14:06:08
141 A_U1 C1 NEXT 49 NaN 0 tablet 2018-04-24 14:40:09
142 A_U1 C1 NEXT 50 NaN 0 tablet 2018-04-24 14:40:46
143 A_U1 C1 NEXT 51 NaN 0 tablet 2018-04-24 14:40:52
144 A_U1 C1 NEXT 52 NaN 0 tablet 2018-04-24 14:41:00
... ... ... ... ... ... ... ... ...
467 A_U1 C1 PREV 55 NaN 0 tablet 2018-06-05 16:06:27
468 A_U1 C1 PREV 54 NaN 0 tablet 2018-06-05 16:07:32
469 A_U1 C1 PREV 11 NaN 0 tablet 2018-06-05 16:07:49
470 A_U1 C1 NEXT 23 NaN 0 tablet 2018-06-05 16:22:51
471 A_U1 C1 PREV 24 NaN 0 tablet 2018-06-05 16:24:36

332 rows × 8 columns

In addition to the functions above, the following functions are available for EventStream:

  • select_operation

  • select_marker_type

  • select_device

  • select_page

  • select_memo_length

  • select_time

[ ]: